Preparing data analysis (数据分析准备)

# Load packages
library(ggtree)
library(Biostrings)
library(tidyverse)
library(phyloseq)
library(ggClusterNet)
library(EasyStat)
library(fs)
library(ggthemes)
library(RColorBrewer)
library(magrittr)
library(MicrobiotaProcess)
library(ggsignif)
library(ggtree)
library(ggtreeExtra)
library(ggstar)
library(MicrobiotaProcess)
library(ggnewscale)
library(grid)

# Read the raw input files (原始文件读取).
# metadata: one row per sample; SampleID becomes the row name for joins downstream.
metadata <- read.delim("data/metadata.tsv")
row.names(metadata) <- metadata$SampleID
# OTU abundance table: rows = OTUs, columns = samples.
otutab <- read.table("data/otutab.txt", header = TRUE, row.names = 1, sep = "\t",
                     comment.char = "", stringsAsFactors = FALSE)
# Taxonomy table: rows = OTUs, columns = taxonomic ranks.
taxonomy <- read.table("data/taxonomy.txt", header = TRUE, row.names = 1, sep = "\t",
                       comment.char = "", stringsAsFactors = FALSE)

1.Community diversity analysis (1.群落多样性分析)

alpha diversity (alpha多样性)

# Load packages
library(reshape2)
library(vegan)
library(ggplot2)
library(ggpubr)
library(patchwork)
library(ape)
library(tidyr)
library(MMUPHin) # meta-analysis for microbiome data
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("MMUPHin")
library(magrittr)
library(dplyr)
library(scales)
library(multcompView)
library(ggsignif)
library(amplicon)
library(ggrepel)
library(rdacca.hp)
library(psych)
library(cowplot)

# Set plot theme
# Shared ggplot2 theme used by every figure in this script: white background,
# no grid, sans fonts (8-10 pt), legend hidden, grey facet-strip fill.
# NOTE: later theme() calls override earlier settings — the axis.text.x spec
# on the second-to-last line (hjust = 0.6) wins over the one set above it.
# NOTE(review): `size` in element_rect()/element_line() is deprecated in
# ggplot2 >= 3.4 in favour of `linewidth` — confirm the installed version.
mytheme = theme_bw() + theme(text = element_text(family = "sans", size = 8))+
    theme(legend.position="none",
    legend.text = element_text(size=10),
    legend.title = element_blank(), 
    panel.background = element_blank(),
    panel.grid = element_blank(),
    axis.text.y = element_text(size=10, colour="black", family = "sans", angle = 0), 
    axis.text.x = element_text(size=10, colour="black", family = "sans", angle = 0, hjust = 0),
    axis.title= element_text(size=10, family = "sans"),
    strip.text.x = element_text(size=10, angle = 0),
    strip.text.y = element_text(size=10, angle = 0),
    plot.title = element_text(size=10, angle = 0),
    strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.2))+
    theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
    theme(axis.line = element_line(size = 0.1, colour = "black"))

alpha diversity calculation(alpha多样性)

#### α多样性(alpha diversity) ####
# 宏基因组metaphlan4结果(metaphlan4 results from metagenomic analysis)
# otutable <- read.table("data/taxonomy.tsv",header=T,sep='\t',stringsAsFactors = F)
# otutable <- separate(otutable, clade_name, c("Kingdom","Phylum","Class","Order","Family","Genus","Species","Taxonomy"),sep="\\|",extra = "drop", fill = "right")
# otutable <- otutable[-which(is.na(otutable$Taxonomy)),]

# 扩增子或kraken2结果(amplicon or metagenome kraken2 results)
# metadata = read.delim("data/metadata.tsv")
# row.names(metadata) = metadata$SampleID
# Build a combined table: taxonomy ranks (cols 1-7), Taxonomy ID (col 8),
# then one column per sample (col 9 onward).
otutab <- read.table("data/otutab.txt", header = TRUE, row.names = 1, sep = "\t", comment.char = "", stringsAsFactors = FALSE)
otutab$Taxonomy <- rownames(otutab)
taxonomy <- read.table("data/taxonomy.txt", header = TRUE, row.names = 1, sep = "\t", comment.char = "", stringsAsFactors = FALSE)
taxonomy$Taxonomy <- rownames(taxonomy)
otutable <- merge(taxonomy, otutab, by = "Taxonomy")
# Move the Taxonomy key after the 7 rank columns and keep every sample column.
# 9:ncol(otutable) generalizes the original hard-coded 9:26, which silently
# assumed exactly 18 sample columns.
otutable <- otutable[, c(2:8, 1, 9:ncol(otutable))]

otutable <- data.frame(otutable, stringsAsFactors = FALSE)
# Sample columns may have been read as character; force them numeric.
otutable[, 9:ncol(otutable)] <- as.data.frame(lapply(otutable[, 9:ncol(otutable)], as.numeric))
metadata <- read.delim("data/metadata.tsv")
# Second metadata column holds the experimental group.
colnames(metadata)[2] <- "Group"

# Metagenomic sequencing: compute alpha diversity at the Species level (i = 7).
# i = 7
# Amplicon sequencing: compute alpha diversity at the Genus level (column 6).
i = 6
# Pair the chosen rank column with the sample columns, then melt and re-cast
# to wide format (taxa x samples), summing counts of duplicate taxa at this rank.
level = cbind(otutable[,i],otutable[,9:ncol(otutable)])
level = melt(level,id.vars= colnames(level)[1],
              measure.vars = colnames(level[,2:ncol(level)]),
              variable.name = "sample",value.name = "relative_abundance")
level = dcast(level, otutable[, i] ~ sample, fun.aggregate = sum)
# Transpose so rows = samples and columns = taxa (the orientation vegan expects)
level = t(level)
colnames(level) = level[1,]
level = level[-1,]
level = data.frame(level,stringsAsFactors = F) 
# Strip the dots that data.frame()/make.names() inserted into taxon names
colnames(level) = gsub('\\.','',colnames(level))
# t() produced a character matrix, so convert every column back to numeric
level1 = apply(level,2,as.numeric)
rownames(level1)=rownames(level)
level=level1

# Compute per-sample alpha-diversity indices on the genus-level matrix
# (rows = samples after the transpose above)
level_diversity = data.frame(Sample_ID = colnames(otutable[9:ncol(otutable)]),
                              observed_species=specnumber(level),
                              #chao1 = estimateR(t(otutable[,-c(1:7)]))[2,], # kraken2 results or amplicon results
                              shannon=vegan::diversity(level, index="shannon"),
                              simpson=vegan::diversity(level, index="simpson"),
                              invsimpson=vegan::diversity(level, index="invsimpson"),
                              # Pielou evenness = Shannon / log(observed richness)
                              Pielou_evenness=vegan::diversity(level,
                              index="shannon")/log(specnumber(level)))
  
# Write the diversity table, one row per sample, named after the rank used
write.table(level_diversity, file = paste0('results/Community_diversity_analysis/alpha_diversity/',colnames(otutable)[i],'_alpha_diversity.txt'),
            quote = FALSE, sep = "\t", row.names = FALSE,col.names = TRUE)

alpha diversity boxplots

Alpha diversity index plots comparing the three groups: KO, OE and WT

# Source and edited from package amplicon
# Boxplot of one alpha-diversity index across groups with pairwise Wilcoxon
# significance brackets. Adapted from the amplicon package.
#
# alpha_div: data.frame of diversity indices, samples as rownames.
# metadata:  data.frame with samples as rownames and a grouping column.
# index:     column of alpha_div to plot (e.g. "shannon").
# groupID:   name of the grouping column in metadata.
# levels:    desired group order; all pairs of levels are compared.
# outlier:   kept for interface compatibility; not used by this implementation.
# Returns a ggplot object. Relies on `mytheme` defined earlier in this script.
alpha_boxplot2 <- function(alpha_div, metadata, index = "shannon", groupID = "group", levels = c(), outlier = FALSE){
  # Load plotting/statistics dependencies on demand
  p_list = c("ggplot2", "dplyr", "multcompView")
  for (p in p_list) {
    if (!requireNamespace(p, quietly = TRUE)) {
      install.packages(p)
    }
    suppressPackageStartupMessages(library(p, character.only = TRUE,
        quietly = TRUE, warn.conflicts = FALSE))
  }
  # Keep only samples present in both tables, in metadata order
  # (the original performed this intersection twice)
  idx = rownames(metadata) %in% rownames(alpha_div)
  metadata = metadata[idx, , drop = FALSE]
  alpha_div = alpha_div[rownames(metadata), ]
  sampFile = as.data.frame(metadata[, groupID], row.names = row.names(metadata))
  df = cbind(alpha_div[rownames(sampFile), index], sampFile)
  colnames(df) = c(index, "group")
  val_max = max(df[, index])
  val_min = min(df[, index])
  # Per-group maximum: vertical position for significance labels, slightly
  # above each box (tidy replacement for the deprecated summarise_() verb)
  y = df %>% group_by(group) %>% summarise(Max = max(.data[[index]]))
  y = as.data.frame(y)
  rownames(y) = y$group
  df$y = y[as.character(df$group), ]$Max + (val_max - val_min) * 0.05
  df = df %>%
    mutate(group = ordered(df$group, levels = levels))
  df$class = index
  # All pairwise group comparisons (generalizes the hard-coded 3-level list;
  # for 3 levels this yields the identical pairs in the identical order)
  compaired = combn(levels, 2, simplify = FALSE)
  p1 = ggplot(df, aes(x = group, y = .data[[index]])) +
    geom_jitter(aes(color = group), position = position_jitter(0.15), size = 0.3, alpha = 1) +
    # Dashed outline: full boxplot including whiskers
    geom_boxplot(position = position_dodge(width = 0.4), width = 0.5, size = 0.4,
                 fill = "transparent",
                 outlier.shape = NA,
                 linetype = "dashed", color = "black") +
    # Solid box limited to the interquartile range
    stat_boxplot(aes(ymin = ..lower.., ymax = ..upper.., fill = group),
                 color = "black", fill = "transparent",
                 position = position_dodge(width = 0.4), width = 0.5, size = 0.4,
                 outlier.shape = NA) +
    # Whisker caps drawn as error bars
    stat_boxplot(geom = "errorbar", aes(ymin = ..ymax..),
                 width = 0.18, color = "black", size = 0.4) +
    stat_boxplot(geom = "errorbar", aes(ymax = ..ymin..),
                 width = 0.18, color = "black", size = 0.4) +
    labs(x = NULL, y = NULL, color = groupID) +
    scale_y_continuous(labels = label_number(accuracy = 0.1)) +
    scale_fill_manual(values = c("#00C0D8", "#FF6060", "lightblue")) +
    scale_color_manual(values = c("#00C0D8", "#FF6060", "lightblue")) +
    # Pairwise Wilcoxon tests, raw p-values printed on the brackets
    geom_signif(comparisons = compaired,
                step_increase = 0.3,
                map_signif_level = F,
                test = wilcox.test,
                color = "black",
                size = 0.2,
                textsize = 3) +
    mytheme +
    facet_grid(. ~ class)
  p1
}

# Data
# Data: diversity table computed above, keyed by sample ID
alpha_div <- level_diversity
# Strip the ".x" suffix from sample IDs. fixed = TRUE treats the dot as a
# literal character — the original regex ".x" would also match any single
# character followed by "x" (e.g. turning "A1x2" into "A2").
alpha_div$Sample_ID <- sub(".x", "", alpha_div$Sample_ID, fixed = TRUE)
rownames(alpha_div) <- alpha_div$Sample_ID
metadata2 <- as.data.frame(metadata)
rownames(metadata2) <- metadata2$SampleID
metadata2$Group2 <- metadata2$Group

# Plot three indices side by side for the KO / OE / WT comparison
p1 <- alpha_boxplot2(alpha_div, metadata2, index = "shannon", groupID = "Group2", levels = c("KO", "OE", "WT"))
p2 <- alpha_boxplot2(alpha_div, metadata2, index = "invsimpson", groupID = "Group2", levels = c("KO", "OE", "WT"))
p3 <- alpha_boxplot2(alpha_div, metadata2, index = "Pielou_evenness", groupID = "Group2", levels = c("KO", "OE", "WT"))

library(patchwork)
p_all_alpha <- p1 + p2 + p3
ggsave(paste0("results/Community_diversity_analysis/alpha_diversity/Alpha_diversity01", ".pdf"), p_all_alpha, width=105 * 1.5, height=65 * 1.5, unit='mm')
p_all_alpha

alpha-rarefaction curve(alpha稀释曲线)

# 定义统计函数
# Define statistical functions
library(vegan)
library(picante)
 
# Read the OTU abundance table and transpose it so rows = samples and
# columns = OTUs (the orientation vegan's rarefaction functions expect)
otu <- read.table("data/otutab.txt", header = TRUE, row.names = 1, sep = "\t",
                  comment.char = "", stringsAsFactors = FALSE)
otu <- t(otu)
 
##定义函数(Define functions)
#计算多种 Alpha 多样性指数,结果返回至向量
#Calculate multiple Alpha diversity indices and return the results to a vector
# Compute one alpha-diversity index per sample (per row of x).
#
# x:      community matrix, rows = samples, columns = taxa (counts).
# method: one of 'richness', 'chao1', 'ace', 'shannon', 'simpson',
#         'pielou', 'gc' (Good's coverage) or 'pd' (requires `tree`).
# tree:   phylogenetic tree, used only for method = 'pd'.
# base:   logarithm base for Shannon / Pielou.
# Returns a numeric vector, one value per sample.
alpha_index <- function(x, method = 'richness', tree = NULL, base = exp(1)) {
    if (method == 'richness') {
        result <- rowSums(x > 0)
    } else if (method == 'chao1') {
        result <- estimateR(x)[2, ]
    } else if (method == 'ace') {
        result <- estimateR(x)[4, ]
    } else if (method == 'shannon') {
        result <- diversity(x, index = 'shannon', base = base)
    } else if (method == 'simpson') {
        result <- diversity(x, index = 'simpson')
    } else if (method == 'pielou') {
        # Pielou evenness = Shannon / log(observed richness)
        result <- diversity(x, index = 'shannon', base = base) /
            log(estimateR(x)[1, ], base)
    } else if (method == 'gc') {
        # Good's coverage: 1 - singletons / total reads
        result <- 1 - rowSums(x == 1) / rowSums(x)
    } else if (method == 'pd' && !is.null(tree)) {
        # PD_whole_tree; && (not the vectorized &) for the scalar condition
        pd <- pd(x, tree, include.root = FALSE)
        result <- pd[, 1]
        names(result) <- rownames(pd)
    } else {
        # The original fell through here with `result` never assigned,
        # producing an obscure "object 'result' not found" error
        stop("unknown method '", method, "' (or missing tree for 'pd')", call. = FALSE)
    }
    result
}
 
# 根据抽样步长(step),统计每个稀释梯度下的 Alpha 多样性指数,结果返回至列表
# According to the sampling step, count the Alpha diversity index under each dilution gradient, and return the results to the list
# Rarefaction curves of an alpha-diversity index: for each sample (row of x),
# subsample at depths 0, step, 2*step, ... up to the sample's own depth (or
# the fixed `rare` depth) and compute the index at each depth.
#
# x:    community matrix, rows = samples, columns = taxa (counts).
# step: sampling-depth increment.
# rare: optional fixed maximum depth applied to all samples.
# Returns a named list (one element per sample) of named numeric vectors
# (names = sampling depths).
alpha_curves <- function(x, step, method = 'richness', rare = NULL, tree = NULL, base = exp(1)) {
    x_nrow <- nrow(x)
    if (is.null(rare)) rare <- rowSums(x) else rare <- rep(rare, x_nrow)
    # Preallocate the result list (the original grew it with c() each pass)
    alpha_rare <- vector("list", x_nrow)

    for (i in seq_len(x_nrow)) {
        step_num <- seq(0, rare[i], step)
        # Make sure the curve ends exactly at the full sampling depth
        if (max(step_num) < rare[i]) step_num <- c(step_num, rare[i])
        # One index value per depth; vapply preallocates the output vector
        alpha_rare_i <- vapply(step_num, function(depth) {
            alpha_index(x = rrarefy(x[i, ], depth),
                        method = method, tree = tree, base = base)
        }, numeric(1))
        names(alpha_rare_i) <- step_num
        alpha_rare[[i]] <- alpha_rare_i
    }
    names(alpha_rare) <- rownames(x)
    alpha_rare
}
 
## Test
# Shannon index of every sample in the OTU table, using log base e
shannon_index <- alpha_index(otu, method = 'shannon', base = exp(1))
# Rarefy the OTU table in steps of 1000 sequences up to each sample's full
# depth, computing the Shannon index (log base e) at every depth
shannon_curves <- alpha_curves(otu, step = 1000, method = 'shannon', base = exp(1))
# shannon_curves
# vegan's built-in rarefaction plot, sampled every 2000 sequences
rarecurve(otu, step = 2000, col = c('red', 'green', 'blue', 'orange', 'purple', 'black'))

## Richness指数曲线(Richness index curve)
# 以下以物种丰富度指数为例绘制 Alpha 多样性曲线(The following is an example of drawing the Alpha diversity curve using the species richness index as an example)
# 以 2000 步长(step=2000)为例统计(Take 2000 steps (step=2000) as an example)
# Richness rarefaction curves at a 2000-sequence step
richness_curves <- alpha_curves(otu, step = 2000, method = 'richness')

# Flatten the per-sample curve list into one long data.frame for ggplot2.
# do.call(rbind, lapply(...)) replaces the original pattern of growing the
# data.frame with rbind() inside a loop (quadratic copying).
plot_richness <- do.call(rbind, lapply(names(richness_curves), function(i) {
    curve_i <- richness_curves[[i]]
    data.frame(rare = names(curve_i), alpha = curve_i, sample = i,
               stringsAsFactors = FALSE)
}))

rownames(plot_richness) <- NULL
# Depth names and index values arrive as character; coerce for plotting
plot_richness$rare <- as.numeric(plot_richness$rare)
plot_richness$alpha <- as.numeric(plot_richness$alpha)
 
# ggplot2: one richness curve per sample; the dashed vertical line marks
# the shallowest sample's depth (a common rarefaction cutoff)
library(ggplot2) 
p1 <- ggplot(plot_richness, aes(rare, alpha, color = sample)) +
  geom_line() +
  labs(x = 'Number of sequences', y = 'Richness', color = NULL) +
  theme(panel.grid = element_blank(), 
        panel.background = element_rect(fill = 'transparent', color = 'black'), 
        legend.key = element_rect(fill = 'transparent')) +
  geom_vline(xintercept = min(rowSums(otu)), linetype = 2) +
  scale_x_continuous(breaks = seq(0, 30000, 5000), labels = as.character(seq(0, 30000, 5000)))
ggsave(paste("results/Community_diversity_analysis/alpha_diversity/Alpha_rarefaction_curve01",".pdf", sep=""), p1, width=105 * 1.5, height=65 * 1.5, unit='mm')

## Repeat the rarefaction several times to obtain mean ± standard deviation
# 5 replicate subsamplings (rrarefy draws randomly, so replicates differ)
plot_richness <- data.frame()
for (n in 1:5) {
    richness_curves <- alpha_curves(otu, step = 2000, method = 'richness')
    for (i in names(richness_curves)) {
        richness_curves_i <- (richness_curves[[i]])
        richness_curves_i <- data.frame(rare = names(richness_curves_i), alpha = richness_curves_i, sample = i, stringsAsFactors = FALSE)
        plot_richness <- rbind(plot_richness, richness_curves_i)
    }
}
 
# Mean ± SD per sample and depth via doBy::summaryBy()
library(doBy)
plot_richness_stat <- summaryBy(alpha~sample+rare, plot_richness, FUN = c(mean, sd))
plot_richness_stat$rare <- as.numeric(plot_richness_stat$rare)
# Blank the SD at depth 0 so no error bar is drawn for the origin point
plot_richness_stat[which(plot_richness_stat$rare == 0),'alpha.sd'] <- NA
 
# ggplot2: curves with points and ±1 SD error bars;
# dashed vertical line = shallowest sample's depth
p2 <- ggplot(plot_richness_stat, aes(rare, alpha.mean, color = sample)) +
  geom_line() +
  geom_point() + geom_errorbar(aes(ymin = alpha.mean - alpha.sd, ymax = alpha.mean + alpha.sd), width = 500) +
  labs(x = 'Number of sequences', y = 'Richness', color = NULL) +
  theme(panel.grid = element_blank(), panel.background = element_rect(fill = 'transparent', color = 'black'), 
        legend.key = element_rect(fill = 'transparent')) +
  geom_vline(xintercept = min(rowSums(otu)), linetype = 2) +
  scale_x_continuous(breaks = seq(0, 30000, 5000), labels = as.character(seq(0, 30000, 5000)))
ggsave(paste("results/Community_diversity_analysis/alpha_diversity/Alpha_rarefaction_curve02",".pdf", sep=""), p2, width=105 * 1.5, height=65 * 1.5, unit='mm')


## Other alpha-diversity index curves
## Shannon index curve
# If plain geom_line() looks too jagged, a spline-fitted style can be used.
# An extra set of shallow-depth points (step = rare = 200) is generated
# first so the fitted Shannon curve is smoother near the origin.
shannon_curves1 <- alpha_curves(otu, step = 200, rare = 200, method = 'shannon')
shannon_curves2 <- alpha_curves(otu, step = 2000, method = 'shannon')
shannon_curves <- c(shannon_curves1, shannon_curves2)
 
# Flatten the per-sample curve lists into one long data.frame
plot_shannon <- data.frame()
for (i in 1:length(shannon_curves)) {
    shannon_curves_i <- shannon_curves[[i]]
    shannon_curves_i <- data.frame(rare = names(shannon_curves_i), alpha = shannon_curves_i, sample = names(shannon_curves)[i], stringsAsFactors = FALSE)
    plot_shannon <- rbind(plot_shannon, shannon_curves_i)
}
 
rownames(plot_shannon) <- NULL
plot_shannon$rare <- as.numeric(plot_shannon$rare)
plot_shannon$alpha <- as.numeric(plot_shannon$alpha)
# Order by sample then depth so each spline is drawn left-to-right
plot_shannon <- plot_shannon[order(plot_shannon$sample, plot_shannon$rare), ]
 
# ggplot2: ggalt::geom_xspline() draws a smoothed spline through the points
library(ggalt)
p3 <- ggplot(plot_shannon, aes(rare, alpha, color = sample)) +
  geom_xspline() +
  labs(x = 'Number of sequences', y = 'Shannon', color = NULL) +
  theme(panel.grid = element_blank(), panel.background = element_rect(fill = 'transparent', color = 'black'), 
        legend.key = element_rect(fill = 'transparent')) +
  geom_vline(xintercept = min(rowSums(otu)), linetype = 2) +
  scale_x_continuous(breaks = seq(0, 30000, 5000), labels = as.character(seq(0, 30000, 5000)))
ggsave(paste("results/Community_diversity_analysis/alpha_diversity/Alpha_rarefaction_curve03",".pdf", sep=""), p3, width=105 * 1.5, height=65 * 1.5, unit='mm')


## PD_whole_tree rarefaction curve
# PD_whole_tree needs the OTU abundance table plus a phylogenetic tree file
otu <- read.table("data/otutab.txt", header = TRUE, row.names = 1, sep = "\t", comment.char = "", stringsAsFactors = FALSE)
otu <- t(otu)

# Read the tree BEFORE computing the PD curves. (The original called
# alpha_curves(otu, tree = tree, ...) once before `tree` existed — which
# fails with "object 'tree' not found" — and then repeated the identical
# call after reading the tree. The premature call is removed here.)
library(ggtree)
library(ape)
tree <- read.tree("data/otus.tree")

# Sample every 2000 sequences (step = 2000)
pd_curves <- alpha_curves(otu, tree = tree, step = 2000, method = 'pd')

# Flatten the per-sample curve list into one long data.frame for ggplot2
plot_pd <- do.call(rbind, lapply(seq_along(pd_curves), function(i) {
    curve_i <- pd_curves[[i]]
    data.frame(rare = names(curve_i), alpha = curve_i,
               sample = names(pd_curves)[i], stringsAsFactors = FALSE)
}))

rownames(plot_pd) <- NULL
plot_pd$rare <- as.numeric(plot_pd$rare)
plot_pd$alpha <- as.numeric(plot_pd$alpha)
# Order by sample then depth so each spline is drawn left-to-right
plot_pd <- plot_pd[order(plot_pd$sample, plot_pd$rare), ]

# ggplot2: smoothed spline per sample; dashed line = shallowest sample depth
library(ggalt)
p4 <- ggplot(plot_pd, aes(rare, alpha, color = sample)) +
  geom_xspline() +
  labs(x = 'Number of sequences', y = 'PD_whole_tree', color = NULL) +
  theme(panel.grid = element_blank(), panel.background = element_rect(fill = 'transparent', color = 'black'), 
        legend.key = element_rect(fill = 'transparent')) +
  geom_vline(xintercept = min(rowSums(otu)), linetype = 2) +
  scale_x_continuous(breaks = seq(0, 30000, 5000), labels = as.character(seq(0, 30000, 5000)))
ggsave(paste("results/Community_diversity_analysis/alpha_diversity/Alpha_rarefaction_curve04",".pdf", sep=""), p4, width=105 * 1.5, height=65 * 1.5, unit='mm')

beta diversity calculation(beta多样性)

#### β多样性(beta diversity) ####
# metaphlan4结果(metaphlan4 results from metagenomic analysis)
# otutable <- read.table("data/taxonomy.tsv",header=T,sep='\t',stringsAsFactors = F)
# otutable <- separate(otutable, clade_name, c("Kingdom","Phylum","Class","Order","Family","Genus","Species","Taxonomy"),sep="\\|",extra = "drop", fill = "right")
# otutable <- otutable[-which(is.na(otutable$Taxonomy)),]

# 扩增子或kraken2结果(amplicon or kraken2 results)
#metadata = read.delim("data/metadata.tsv")
#row.names(metadata) = metadata$SampleID
# Build the combined taxonomy + abundance table for the beta-diversity section
otutab <- read.table("data/otutab.txt", header = TRUE, row.names = 1, sep = "\t", comment.char = "", stringsAsFactors = FALSE)
otutab$Taxonomy <- rownames(otutab)
taxonomy <- read.table("data/taxonomy.txt", header = TRUE, row.names = 1, sep = "\t", comment.char = "", stringsAsFactors = FALSE)
# Keep only the 7 taxonomic rank columns
taxonomy <- taxonomy[, 1:7]
taxonomy$Taxonomy <- rownames(taxonomy)
otutable <- merge(taxonomy, otutab, by = "Taxonomy")
# Ranks first, Taxonomy key, then every sample column. 9:ncol(otutable)
# generalizes the original hard-coded 9:26 (assumed exactly 18 samples).
otutable <- otutable[, c(2:8, 1, 9:ncol(otutable))]

otutable <- data.frame(otutable, stringsAsFactors = FALSE)
# Force sample columns numeric in case they were read as character
otutable[, 9:ncol(otutable)] <- as.data.frame(lapply(otutable[, 9:ncol(otutable)], as.numeric))
metadata <- read.delim("data/metadata.tsv")
# Second metadata column holds the experimental group
colnames(metadata)[2] <- "Group"

#### Beta diversity ####
i=6 # Genus level (column 6) is used for the beta-diversity analysis
# Same reshape as in the alpha section: aggregate counts at the chosen rank,
# then transpose so rows = samples, columns = taxa
level = cbind(otutable[,i],otutable[,9:ncol(otutable)])
level = melt(level,id.vars= colnames(level)[1],
             measure.vars = colnames(level[,2:ncol(level)]),
             variable.name = "sample",value.name = "relative_abundance")
level = dcast(level, otutable[, i] ~ sample, fun.aggregate = sum)
# Transpose
level = t(level)
colnames(level) = level[1,]
level = level[-1,]
level = data.frame(level,stringsAsFactors = F) 
colnames(level) = gsub('\\.','',colnames(level))
# Convert the character matrix produced by t() back to numeric
level1 = apply(level,2,as.numeric)
rownames(level1)=rownames(level)
level=level1

# Detrended correspondence analysis, used to choose the ordination method:
# DCA1 axis length > 4.0 -> CCA; 3.0-4.0 -> either RDA or CCA; < 3.0 -> RDA
DCA=decorana(level)
# DCA=summary(DCA)

# Convert the sample columns to relative abundance (each column sums to 1)
RA <- otutable
RA[,9:ncol(RA)] <- apply(RA[,9:ncol(RA)],2, function(x) x / sum(x) )

# Rebuild the sample x taxa matrix from the relative-abundance table
level = cbind(RA[,i],RA[,9:ncol(RA)])
level = melt(level,id.vars= colnames(level)[1],
              measure.vars = colnames(level[,2:ncol(level)]),
              variable.name = "Sample_ID",value.name = "relative_abundance")
level = dcast(level, RA[, i] ~ Sample_ID, fun.aggregate = sum)
# Transpose so rows = samples, columns = taxa
level = t(level)
colnames(level) = level[1,]
level = level[-1,]
level = data.frame(level,stringsAsFactors = F) 
colnames(level) = gsub('\\.','',colnames(level))
# Convert the character matrix produced by t() back to numeric
level1 = apply(level,2,as.numeric)
rownames(level1)=rownames(level)
level <- as.data.frame(level1)

# Attach metadata so group information travels with the abundance matrix
level.reset = level
level.reset$Sample_ID <- rownames(level.reset)
metadata$Sample_ID <- metadata$SampleID
level.reset <- merge(metadata, level.reset, by='Sample_ID')
# NOTE(review): assumes the group column lands in position 3 after the
# merge — verify if the metadata column layout changes
colnames(level.reset)[3] = 'Group2'

rownames(level.reset) = level.reset[,1]

# Beta-diversity distance matrix; vegdist's default method is 'bray'
# (Bray-Curtis).
# NOTE(review): -c(1:12) assumes exactly 12 metadata columns precede the
# taxa columns — verify against the metadata file
level_distance = vegdist(level.reset[,-c(1:12)])  
level_distance = as.matrix(level_distance)
# Write the Bray-Curtis matrix.
# NOTE(review): row.names = FALSE drops the sample IDs from the rows of a
# square matrix — consider row.names = TRUE for a re-readable file
write.table(level_distance, file = paste0('results/Community_diversity_analysis/beta_diversity/',colnames(otutable)[i],'_beta_diversity.txt'),
            quote = FALSE, sep = "\t", row.names = FALSE,col.names = TRUE)

Beta diversity PCoA: function for plotting PCoA ordination plots

# Source and edited from package amplicon
# PCoA ordination plot of a beta-diversity distance matrix, colored by group.
# Adapted from the amplicon package.
#
# dis_mat:  square distance matrix (sample x sample).
# metadata: data.frame with samples as rownames.
# groupID:  metadata column used for point color and ellipses.
# groupID2: metadata column used for point shape.
# levels:   group order; the first three levels get confidence ellipses.
# ellipse:  draw 78% confidence ellipses per group.
# label:    add sample-name labels with ggrepel.
# PCo:      which axis pair to draw: 12, 13 or 23.
# Returns a ggplot object.
beta_pcoa2 = function (dis_mat, metadata, groupID = "Group", groupID2 = "Group", levels = c(), ellipse = T, 
    label = F, PCo = 12) 
  {
    p_list = c("ggplot2", "vegan", "ggrepel")
    for (p in p_list) {
        if (!requireNamespace(p, quietly = TRUE)) {
            install.packages(p)
        }
        suppressWarnings(suppressMessages(library(p, character.only = T)))
    }
    # Keep samples present in the distance matrix and reorder it to match
    idx = rownames(metadata) %in% rownames(dis_mat)
    metadata = metadata[idx, , drop = F]
    dis_mat = dis_mat[rownames(metadata), rownames(metadata)]
    sampFile = as.data.frame(metadata[, groupID], row.names = row.names(metadata))
    sampFile2 = as.data.frame(metadata[, groupID2], row.names = row.names(metadata))
    # Classical multidimensional scaling, first three axes
    pcoa = cmdscale(dis_mat, k = 3, eig = T)
    points = as.data.frame(pcoa$points)
    eig = pcoa$eig
    points = cbind(points, sampFile[rownames(points), ])
    points = cbind(points, sampFile2[rownames(points), ])
    colnames(points) = c("x", "y", "z", "group", "group2")
    # (The original re-assigned points$group2 = metadata$Group here, which
    # both ignored the groupID2 argument and could scramble row order;
    # sampFile2 above already supplies group2 in the correct order.)
    points = points %>%
        mutate(group = ordered(points$group, levels = levels))
    # Percent variance explained per axis, used for the axis labels.
    # (The original labelled axis 3 with eig[2] and, for PCo = 23,
    # axis 2 with eig[1] — fixed to use the matching eigenvalues.)
    pct = 100 * eig / sum(eig)
    if (PCo == 12) {
        p = ggplot(points, aes(x = x, y = y, color = group, shape = group2)) + 
            labs(x = paste("PCo axis 1 (", format(pct[1], digits = 4), "%)", sep = ""),
                 y = paste("PCo axis 2 (", format(pct[2], digits = 4), "%)", sep = ""),
                 color = groupID)
    }
    if (PCo == 13) {
        p = ggplot(points, aes(x = x, y = z, color = group, shape = group2)) + 
            labs(x = paste("PCo axis 1 (", format(pct[1], digits = 4), "%)", sep = ""),
                 y = paste("PCo axis 3 (", format(pct[3], digits = 4), "%)", sep = ""),
                 color = groupID)
    }
    if (PCo == 23) {
        p = ggplot(points, aes(x = y, y = z, color = group, shape = group2)) + 
            labs(x = paste("PCo axis 2 (", format(pct[2], digits = 4), "%)", sep = ""),
                 y = paste("PCo axis 3 (", format(pct[3], digits = 4), "%)", sep = ""),
                 color = groupID)
    }
    p = p + geom_point(alpha = 0.7, size = 0.6) + theme_classic() + 
        theme(text = element_text(family = "sans", size = 7)) +
      scale_fill_manual(values = c("#00C0D8", "#FF6060", "lightblue")) +
      scale_color_manual(values = c("#00C0D8", "#FF6060", "lightblue")) +
      scale_shape_manual(values = c(19, 19, 19))
    # (The original left a dangling `coord_fixed(ratio = 1)` statement after a
    # commented-out `+`; it built a coord object and silently discarded it, so
    # it is dropped here. Append `+ coord_fixed(1)` above to enforce 1:1 axes.)
    if (ellipse == T) {
        p = p + 
          stat_ellipse(data = filter(points, group == levels[1]), aes(group = group), level = 0.78, size = 0.4) +
          stat_ellipse(data = filter(points, group == levels[2]), aes(group = group), level = 0.78, size = 0.4) +
          stat_ellipse(data = filter(points, group == levels[3]), aes(group = group), level = 0.78, size = 0.4) +
          theme(text = element_text(family = "sans", size = 7)) +
          theme(legend.position = "none",
                legend.text = element_text(size = 10),
                legend.title = element_blank(),
                axis.text.y = element_text(size = 10, colour = "black", family = "sans", angle = 0),
                axis.text.x = element_text(size = 10, colour = "black", family = "sans", angle = 0, hjust = 0),
                axis.title = element_text(size = 10)) +
          theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.6)) +
          theme(axis.line = element_line(size = 0.2, colour = "black"))
    }
    if (label == T) {
        p = p + geom_text_repel(label = paste(rownames(points)),
            colour = "black", size = 2)
    }
    p
}

beta diversity boxplots (beta多样性)

# Genus-level distance matrix computed above
distance_mat = level_distance

# typeof(distance_mat)
rownames(metadata) <- metadata$SampleID

# metadata2 = t(metadata)
metadata2 = metadata
# Keep only samples present in the metadata: filter rows, then columns
distance_mat2 = distance_mat[rownames(distance_mat) %in% rownames(metadata2), ]
distance_mat3 = distance_mat2[, colnames(distance_mat2) %in% rownames(metadata2)]
distance_mat = distance_mat3

# Reorder both dimensions to metadata order via two transposes.
# NOTE(review): indexing by rows01 assumes every metadata sample survived
# the filtering above — verify if the inputs can disagree
rows01 = rownames(metadata2)
distance_mat2 = distance_mat[rows01, ]
distance_mat2 = t(distance_mat2)
distance_mat3 = distance_mat2[rows01, ]
distance_mat = t(distance_mat3)

# PCoA (classical multidimensional scaling) on the aligned distance matrix
level_distance <- distance_mat
pcoa = cmdscale (level_distance,eig=TRUE,k =3)
# cmdscale computes the multidimensional scaling transformation
level.pcoa = pcoa$points[,c(1,2)] 
colnames(level.pcoa)=c('PC1','PC2')
# Percent variance explained by each axis
pc = round(pcoa$eig/sum(pcoa$eig)*100,digits=2)
level.pcoa = as.data.frame(level.pcoa)
level.pcoa$Sample_ID = row.names(level.pcoa)
level.pcoa = merge(metadata,level.pcoa,by='Sample_ID')
# NOTE(review): assumes the group column lands in position 3 after the merge
colnames(level.pcoa)[3] = 'Group2'
level.pcoa$Group2 = factor(level.pcoa$Group2,levels = c('KO','OE','WT'))

# Significance test of group separation.
# anosim is typically paired with NMDS:
# level.anosim = anosim(level_distance,level.pcoa[,3],permutations = 999)
# PERMANOVA (adonis2), typically paired with PCoA; 999 permutations.
# NOTE(review): column 3 (group) and columns 13,14 (PC1, PC2) are
# hard-coded — verify if the metadata layout changes
level.adonis = adonis2(level_distance~level.pcoa[,3],data=level.pcoa[,c(13,14)],distance = "bray",permutations = 999)
level.adonis
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = level_distance ~ level.pcoa[, 3], data = level.pcoa[, c(13, 14)], permutations = 999, distance = "bray")
                Df SumOfSqs      R2      F Pr(>F)    
level.pcoa[, 3]  2 0.061682 0.27855 2.8957  0.001 ***
Residual        15 0.159757 0.72145                  
Total           17 0.221439 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Draw the PCoA and annotate it with the adonis2 statistics printed above
(p_beta_s = beta_pcoa2(distance_mat, metadata2, groupID = "Group",groupID2 = "Group",levels = c('KO','OE','WT'), ellipse = T))

# R2 and P are transcribed from the adonis2 output: R2 = 0.27855,
# Pr(>F) = 0.001 (the original label said "P = 0.002", which did not match
# the printed result)
p_beta_s = p_beta_s + annotate(geom = "text", x = -0.19, y = 0.15, label = "R2 = 0.27855, P = 0.001", hjust = "left", size = 3)+ 
  # NOTE(review): this analysis runs at genus level (i = 6) and the output
  # file is named "..._genus", so the panel title is "Genus" here; the
  # original titled it "Species" — confirm which is intended
  ggtitle("Genus")+theme(plot.title = element_text(face="bold", size = 10))
ggsave(paste("results/Community_diversity_analysis/beta_diversity/beta_diversity_genus",".pdf", sep=""), p_beta_s, width=89 * 1.5, height=70 * 1.5, unit='mm')
p_beta_s

Species Taxonomic Tree(物种分类树)

# rm(list = ls())
# Load packages
# 载入软件包
library(MicrobiotaProcess)
library(dplyr) 
library(ggplot2)
library(phyloseq)
library(ggtree)
library(ggtreeExtra) 
library(ggstar)
library(forcats)
library(conflicted)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
conflict_scout()
conflicted::conflicts_prefer(phyloseq::tax_table)

# Load sample metadata, OTU counts and taxonomy for the taxonomic tree.
# header = TRUE replaces the original header = 1, which only worked through
# implicit numeric-to-logical coercion.
sample <- read.table("data/sample.txt", check.names = FALSE, row.names = 1, header = TRUE, sep = "\t")
OTU <- read.table("data/otu.txt", check.names = FALSE, row.names = 1, header = TRUE, sep = "\t")
Tax <- read.table("data/tax.txt", check.names = FALSE, row.names = 1, header = TRUE, sep = "\t")

# Assemble a phyloseq object from the three tables, then convert it to the
# MPSE format that MicrobiotaProcess's mp_* verbs operate on
ps <- phyloseq(sample_data(sample),
               otu_table(as.matrix(OTU), taxa_are_rows = TRUE),
               tax_table(as.matrix(Tax)))
df <- ps %>% as.MPSE()

# Compute relative species abundance, first per sample then per group
df %<>%
  mp_cal_abundance( # for each sample
    .abundance = RareAbundance
  ) %>%
  mp_cal_abundance( # for each group
    .abundance=RareAbundance,
    .group=group
  )

# Differential-abundance analysis (LEfSe-like cascade): Kruskal-Wallis
# across groups, pairwise Wilcoxon within groups, then LDA effect-size
# scoring of the taxa that pass both filters
df %<>%
  mp_diff_analysis(
    .abundance = RelRareAbundanceBySample,
    .group = group,
    tip.level = "OTU",
    force = FALSE,
    relative = TRUE,
    taxa.class = "all",
    first.test.method = "kruskal.test",
    first.test.alpha = 0.05,
    p.adjust = "fdr",
    filter.p = "fdr",
    strict = TRUE,
    fc.method = "generalizedFC",
    second.test.method = "wilcox.test",
    second.test.alpha = 0.05,
    cl.min = 4,
    cl.test = TRUE,
    subcl.min = 3,
    subcl.test = TRUE,
    ml.method = "lda",# 'lda' or 'rf'
    normalization = 1e+06,
    ldascore = 2,# LDA score threshold
    bootnums = 30,
    sample.prop.boot = 0.7,
    ci = 0.95,
    seed = 123,
    type = "species"
  )

# Extract the annotated taxonomy tree (carries the LDA scores, p-values
# and FDR from the analysis above) for visualization with ggtree/ggtreeExtra
taxa.tree <- df %>% 
  mp_extract_tree(type="taxatree")
taxa.tree
'treedata' S4 object'.

...@ phylo:

Phylogenetic tree with 60 tips and 126 internal nodes.

Tip labels:
  OTU31, OTU53, OTU9, OTU23, OTU48, OTU21, ...
Node labels:
  r__root, k__Bacteria, p__Actinobacteria, p__Bacteroidetes, p__Chloroflexi,
p__Firmicutes, ...

Rooted; no branch lengths.

with the following features available:
  'nodeClass', 'nodeDepth', 'RareAbundanceBySample', 'RareAbundanceBygroup',
'LDAupper', 'LDAmean', 'LDAlower', 'Sign_group', 'pvalue', 'fdr'.

# The associated data tibble abstraction: 186 × 13
# The 'node', 'label' and 'isTip' are from the phylo tree.
    node label isTip nodeClass nodeDepth RareAbundanceBySample
   <dbl> <chr> <lgl> <chr>         <dbl> <list>               
 1     1 OTU31 TRUE  OTU               8 <tibble [12 × 4]>    
 2     2 OTU53 TRUE  OTU               8 <tibble [12 × 4]>    
 3     3 OTU9  TRUE  OTU               8 <tibble [12 × 4]>    
 4     4 OTU23 TRUE  OTU               8 <tibble [12 × 4]>    
 5     5 OTU48 TRUE  OTU               8 <tibble [12 × 4]>    
 6     6 OTU21 TRUE  OTU               8 <tibble [12 × 4]>    
 7     7 OTU57 TRUE  OTU               8 <tibble [12 × 4]>    
 8     8 OTU20 TRUE  OTU               8 <tibble [12 × 4]>    
 9     9 OTU26 TRUE  OTU               8 <tibble [12 × 4]>    
10    10 OTU25 TRUE  OTU               8 <tibble [12 × 4]>    
# ℹ 176 more rows
# ℹ 7 more variables: RareAbundanceBygroup <list>, LDAupper <dbl>,
#   LDAmean <dbl>, LDAlower <dbl>, Sign_group <chr>, pvalue <dbl>, fdr <dbl>
# Inspect LDA scores and (adjusted) p-values for the tested nodes
taxa.tree %>% 
  select(label, nodeClass, LDAupper, LDAmean, LDAlower, Sign_group, pvalue, fdr) %>%
  dplyr::filter(!is.na(fdr))
# A tibble: 184 × 8
   label nodeClass LDAupper LDAmean LDAlower Sign_group  pvalue    fdr
   <chr> <chr>        <dbl>   <dbl>    <dbl> <chr>        <dbl>  <dbl>
 1 OTU31 OTU          NA      NA       NA    <NA>       0.00523 0.0446
 2 OTU53 OTU          NA      NA       NA    <NA>       0.368   0.389 
 3 OTU9  OTU           4.39    4.33     4.27 C          0.00598 0.0446
 4 OTU23 OTU           4.02    3.96     3.88 C          0.00921 0.0446
 5 OTU48 OTU           4.48    4.42     4.34 B          0.00537 0.0446
 6 OTU21 OTU          NA      NA       NA    <NA>       0.0999  0.145 
 7 OTU57 OTU          NA      NA       NA    <NA>       0.368   0.389 
 8 OTU20 OTU          NA      NA       NA    <NA>       0.0229  0.0539
 9 OTU26 OTU          NA      NA       NA    <NA>       0.0163  0.0539
10 OTU25 OTU          NA      NA       NA    <NA>       0.0152  0.0539
# ℹ 174 more rows
# Base radial taxonomy tree; highlight phylum-level clades with
# translucent fills
p1 <- ggtree(
  taxa.tree,
  #aes(color = taxa.tree@phylo[["node.label"]]),
  layout="radial",
  size = 0.5,open.angle=15, branch.length = "none") +
  geom_hilight(data = td_filter(nodeClass == "Phylum"),
               mapping = aes(node = node, fill = label),alpha = 0.2)
p1

p12 <- p1# + ggnewscale::new_scale_fill()

# First heatmap ring: proportion of detection per taxon.
# Fix: `header`/`check.names` take logicals -- `header = 1` and
# `check.names = F` relied on coercion / the reassignable `F` alias;
# spell out TRUE/FALSE explicitly (behavior unchanged).
proportions_df <- read.table("data/ring02.txt", check.names = FALSE,
                             row.names = 1, header = TRUE, sep = "\t")
proportions_df <- as.data.frame(proportions_df)
library(ggtext)
# Negative offset draws this ring inside the tips
p2<-gheatmap(p12, 
             proportions_df, 
             offset=-0.5, width=0.15,
             colnames=FALSE) +
  scale_fill_manual(
    values = c("#FFB5C5", "#FF7256", "#EE0000","#EBF7FA","#4DBBD5FF","#FFB5C5", "#FF7256", "#EE0000"),
    name = "Proportion"
    )  +
  labs(fill="Proportion of detected") +
  theme(legend.title = element_markdown())
p2

p3 <- p2# + ggnewscale::new_scale_fill()

# Second heatmap ring: bsh-gene status per taxon.
# Fix: use explicit logicals for `header`/`check.names` instead of
# `header = 1` / `check.names = F` (behavior unchanged).
status_df <- read.table("data/ring03.txt", check.names = FALSE,
                        row.names = 1, header = TRUE, sep = "\t")
status_df <- as.data.frame(status_df)
p4 <- gheatmap(p3, 
               status_df, 
               offset=0.5, width=0.15,
               colnames=FALSE) + 
  scale_fill_manual(values=c("#EBF7FA","#4DBBD5FF","#FFB5C5", "#FF7256", "#EE0000","#EBF7FA","#4DBBD5FF","#FFB5C5", "#FF7256", "#EE0000"),
                    name="*bsh* gene") +
  theme(legend.title = element_markdown())
p4

# Start a fresh fill scale for the LDA bar ring
p42 <- p4 + ggnewscale::new_scale_fill()

# Outer ring: LDA effect-size bars for significant taxa, coloured by the
# group they are enriched in; tip labels added last
p5 <- p42 + geom_fruit(
    geom = geom_col,
    mapping = aes(
      x = LDAmean,
      fill = Sign_group,
      subset = !is.na(LDAmean)),
    orientation = "y",
    offset = 0.3,
    pwidth = 0.5,
    axis.params = list(axis = "x",
                       title = "Log10(LDA)",
                       title.height = 0.01,
                       title.size = 2,
                       text.size = 1.8,
                       vjust = 1),
    grid.params = list(linetype = 2))+
  geom_tiplab(size=2, offset=5.5)
p5

# Open a wedge in the circle and rotate for readability, then save
p6 <- open_tree(p5, 30) %>% rotate_tree(-76)
ggsave(filename = "results/Community_diversity_analysis/taxonomic_tree/Species_Taxonomic_Tree.pdf", plot = p6, width = 10, height = 8, units = "in", dpi = 300)
p6

Species composition-chord diagram(物种组成-和弦图)

# install.packages("circlize")
library(circlize) # chord-diagram plotting
library(reshape2)
# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)

# Phylum level: keep the Phylum column plus the sample-count columns
otutab_p <- otutab2[, c(2, 8:25)]

# Sum counts per phylum
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]# drop the Phylum column (now in rownames)

# Transpose to samples x phyla; derive group labels by stripping the
# digits from the sample IDs
otutab_p3 <- t(otutab_p2)
otutab_p3 <- as.data.frame(otutab_p3)
otutab_p3$group <- rownames(otutab_p3)
otutab_p3$group = gsub("[0-9]","", otutab_p3$group)

# Mean abundance per group
otutab_p4 <- aggregate(.~ group, data = otutab_p3, mean)
rownames(otutab_p4) <- otutab_p4$group

# Convert wide data to long (one row per group x phylum)
long_data01 <- melt(otutab_p4,
                  id.vars = "group",  # column kept as identifier
                  variable.name = "bacteria",  # stores the original column names
                  value.name = "abundance"  # stores the original values
)

# Quick on-screen preview
chordDiagram(long_data01)

circos.clear()

# Save as pdf file
pdf("results/Community_diversity_analysis/species_composition/species_composition_chord_diagram.pdf", width = 8, height = 8)
chordDiagram(long_data01,
             grid.col = c(S1 = "lightcoral", S2 = "coral2", S3 = "coral4",# sector (ring) colours
                          E1 = "lightgreen", E2 = "green", E3 = "green3", E4 = "green4", E5 = "olivedrab4"),
             annotationTrack = c("name", "grid"), # show names and sectors, no axis
             col = hcl.colors(15), # link (ribbon) colours
             transparency = 0.5,   # link transparency
             directional = 1,      # links point from rows to columns
             link.lwd = 1,         # link border width
             link.lty = 2,         # link border type
             link.border = 1)      # link border colour
circos.clear()
dev.off()
png 
  2 

Species composition-stacked bar diagram(物种组成-堆叠柱状图)

# Load package
library(ggplot2)
library(ggprism)
library(dplyr)
library(plyr)
library(dplyr)# NOTE(review): dplyr attached twice -- harmless but redundant
library(ggalluvial)
library(tidyverse)
library(tidyr)
library(reshape2)
library(ggpubr)

# Shared ggplot theme for the stacked-bar figures.
# NOTE(review): `size` in element_rect()/element_line() is deprecated in
# ggplot2 >= 3.4 in favour of `linewidth` -- still works, with a warning.
mytheme = theme_bw() + theme(text = element_text(family = "sans", size = 6))+
  theme(#legend.position="none",
    legend.text = element_text(size=12),
    legend.title = element_blank(), 
    panel.background = element_blank(),
    panel.grid = element_blank(),
    axis.text.y = element_text(size=12, colour="black", family = "sans", angle = 0), 
    axis.text.x = element_text(size=12, colour="black", family = "sans", angle = 0, hjust = 0),
    axis.title= element_text(size=12),
    strip.text.x = element_text(size=12, angle = 0),
    strip.text.y = element_text(size=12, angle = 0),
    plot.title = element_text(size=12, angle = 0),
    strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.2))+
      theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
  theme(axis.line = element_line(size = 0.1, colour = "black"))

# Sum counts at the phylum level
df3_p <- read.table(file = "data/sum_p.txt", sep = "\t", header = T, check.names = FALSE)
design <- read.table(file = "data/group.txt", sep = "\t", header = T, row.names=1)
data_p<-aggregate(.~ Phylum,data=df3_p,sum)
rownames(data_p) = data_p$Phylum
data_p = data_p[, -1]# drop the Phylum column (17 phyla remain)
# write.csv(data_p, "results/779samples_phylum.csv")

# Sort phyla by decreasing total abundance, then convert counts to
# per-sample relative abundances (each column sums to 1)
mean_sort = data_p[(order(-rowSums(data_p))), ]
mean_sort = as.data.frame(mean_sort)
mean_sort2 = t(mean_sort)
mean_sort2 = mean_sort2[order(-mean_sort2[,1]),]
mean_sort3 = t(mean_sort2)
mean_sort3 = apply(mean_sort3, 2, function(x) x/sum(x))
mean_sort3 = as.data.frame(mean_sort3)

# Keep the Top 5 phyla; collapse everything else into "others"
other = colSums(mean_sort3[6:dim(mean_sort3)[1], ])
mean_sort3 = mean_sort3[(6 - 1):1, ]# top 5, reversed so "others" stacks first
mean_sort3 = rbind(other,mean_sort3)
rownames(mean_sort3)[1] = c("others")
mean_sort3 = as.data.frame(mean_sort3)

# Map each sample to its group for the stacked plot
sampFile = data.frame(sample = row.names(design), group = design$Group, 
                      row.names = row.names(design))
mean_sort3$tax = rownames(mean_sort3)
# Calculate average relative abundance for each group
mat_t = t(mean_sort3)
mat_t2 = merge(sampFile, mat_t, by = "row.names")
mat_t2 = mat_t2[,c(-1,-2)]
mat_t2 = as.data.frame(mat_t2)
mat_t2$group = as.factor(mat_t2$group)
mat_t3 = mat_t2[, -1]
mat_t3 = mutate_all(mat_t3, as.numeric)
mat_t3$group = mat_t2$group
mat_t3 = as.data.frame(mat_t3)

# Group means; drop the group row and label columns by group name
mat_mean2 = aggregate(.~group, data = mat_t3, FUN=function(x) mean(x))
mat_mean_final = do.call(rbind, mat_mean2)[-1,]
geno = mat_mean2$group
colnames(mat_mean_final) = geno
mean_sort=as.data.frame(mat_mean_final)

# Reshape the group-mean table to long format for plotting,
# ordered by decreasing abundance
mean_sort$tax = rownames(mean_sort)
mean_sort4 = as.data.frame(mean_sort)
mean_sort4$tax = mean_sort$tax
data_all22 = as.data.frame(melt(mean_sort4, id.vars=c("tax")))
data_all22 = data_all22[order(-data_all22$value), ]

# Stacked bar chart of group-mean phylum composition (alluvial-ready aes)
p_phylum01 = ggplot(data_all22, aes(x=factor(variable, levels = unique(variable)), 
                                    y = value, fill = factor(tax, levels = unique(tax)),
                           stratum = factor(tax, levels = unique(tax)), 
                           alluvium = factor(tax, levels = unique(tax)))) +
  geom_bar(stat = "identity", position = "fill", width=0.2)+
  scale_y_continuous(labels = scales::percent, expand = c(0,0)) +
  coord_cartesian(ylim = c(0,1))+
  xlab("")+
  ylab("Percentage (%)")+ theme_classic()+
  guides(fill=guide_legend(title="Phylum"))+
  theme(legend.key.size = unit(0.4, "cm"))+
  theme(text = element_text(family = "sans", size = 8))+
  theme(#legend.position="none",
    legend.text = element_text(size=12),
    legend.title = element_blank(), 
    panel.background = element_blank(),
    panel.grid = element_blank(),
    axis.text.y = element_text(size=12, colour="black", 
                               family = "sans", angle = 0), 
    axis.text.x = element_text(size=12, colour="black", 
                               family = "sans", angle = 0, hjust = 0),
    axis.title= element_text(size=12),
    strip.text.x = element_text(size=12, angle = 0),
    strip.text.y = element_text(size=12, angle = 0),
    plot.title = element_text(size=12, angle = 0),
    strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.2))+
      theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
  theme(axis.line = element_line(size = 0.1, colour = "black"))+
  scale_fill_manual(values=c("#e1abbc","#edd064","#0eb0c8","#f2ccac","#a1d5b9","#6a73cf")) +
  scale_color_manual(values=c("#e1abbc","#edd064","#0eb0c8","#f2ccac","#a1d5b9","#6a73cf"))+
  # NOTE(review): this second layer draws stacked (not fill-normalised)
  # columns of width 0.5 over the width-0.2 bars above; group values sum to
  # ~1 so the result looks similar, but confirm the layer is intentional.
  geom_col(width = 0.5, color=NA)
ggsave(paste("results/Community_diversity_analysis/species_composition/Phylum_composition_stack",".pdf", sep=""), p_phylum01, width=79 * 1.5, height=69 * 1.5, unit='mm')
p_phylum01

# Comparison of microbial differences at the phylum level in box plots
data_p2 <- data_p
mean_sort = data_p2[(order(-rowSums(data_p2))), ]# decreasing order by total abundance
mean_sort = as.data.frame(mean_sort)
mean_sort2 = t(mean_sort)
mean_sort2 = mean_sort2[order(-mean_sort2[,1]),]
mean_sort3 = t(mean_sort2)
mean_sort3 = apply(mean_sort3, 2, function(x) x/sum(x))
mean_sort3 = as.data.frame(mean_sort3)

# Keep the Top 5 phyla; samples as rows; group labels derived by
# stripping the digits from the sample IDs
mean_sort3 = mean_sort3[(6 - 1):1, ]
mean_sort3 = t(mean_sort3)
mean_sort3 = as.data.frame(mean_sort3)
mean_sort3$group = rownames(mean_sort3)
mean_sort3$group = gsub("[0-9]","", mean_sort3$group)

# Wilcoxon test per phylum (`.` in the formula is the current column in
# the purrr lambda), adjusted by the BH method
diff_phylum <- mean_sort3 %>%
    select_if(is.numeric) %>%
    map_df(~ broom::tidy(wilcox.test(. ~ group,data = mean_sort3, conf.int = TRUE)), .id = 'var')
diff_phylum$padjust <- p.adjust(diff_phylum$p.value,"BH")
write.csv(diff_phylum, "results/Community_diversity_analysis/species_composition/Phylum_selected_wilcox_test.csv")

# Melt from wide to long for the box plot (one row per sample x phylum)
data_long_m<-melt(mean_sort3, id.vars = c("group"), 
                  measure.vars = c('Firmicutes','Bacteroidetes','Proteobacteria',
                                   'Actinobacteria','Verrucomicrobia'),
                  variable.name = c('Phylum'),
                  value.name = 'value')

# Boxplot; significance annotated with the Wilcoxon rank-sum test
p_phylum02 <- ggplot(data_long_m,aes(x=Phylum,y=value,fill=group))+
  stat_boxplot(geom = "errorbar",width=0.4,position=position_dodge(0.8))+
  geom_boxplot(width=0.6,alpha=1,position=position_dodge(0.8), outlier.shape = NA)+mytheme+
  theme(legend.position = "top")+
  stat_compare_means(aes(group=group), method = "wilcox.test",label="p.signif")+
  labs(x = "Phylum", y = "Percentage (%)")+
  scale_y_continuous(labels = scales::percent, expand = c(0,0.1))+
  geom_jitter(aes(color=group),
              shape=21, size=0.6,alpha=0.5, 
              fill="transparent",
              position =  position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8)
              )+
  scale_fill_manual(values = c("#4e8397","#ff8066"))+
  scale_color_manual(values = c("#4e8397","#ff8066"))+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
ggsave(paste("results/Community_diversity_analysis/species_composition/Phylum_select_wilcox",".pdf", sep=""), p_phylum02, width=109 * 1.5, height=89 * 1.5, unit='mm')
p_phylum02

# Combine the stacked bars and the box plot side by side
library(patchwork)
p_phylum <- p_phylum01 + p_phylum02 + plot_layout(ncol = 2, widths = c(1, 4))
p_phylum

ggsave(paste("results/Community_diversity_analysis/species_composition/Phylum_stackplot_wilcox",".pdf", sep=""), p_phylum, width=159 * 1.5, height=79 * 1.5, unit='mm')

Species composition-circular stacked bar diagram(物种组成-环状堆叠柱状图)

# install.packages("circlize")
library(circlize) # circular plotting utilities
library(reshape2)
# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Phylum level: keep the Phylum column plus the sample-count columns
otutab_p <- otutab2[, c(2, 8:25)]
# Sum counts per phylum
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]# drop the Phylum column (now in rownames)
otutab_p3 <- t(otutab_p2)
otutab_p3 <- as.data.frame(otutab_p3)
otutab_p3$group <- rownames(otutab_p3)
#otutab_p3$group = gsub("[0-9]","", otutab_p3$group)

# Convert wide data to long (one row per sample x phylum)
long_data02 <- melt(otutab_p3,
                  id.vars = "group", 
                  variable.name = "bacteria",
                  value.name = "abundance"
)

# Derive group labels by stripping the digits from the sample IDs
long_data02$group2 <- long_data02$group
long_data02$group2 = gsub("[0-9]","", long_data02$group2)
# write.csv(long_data02, "results/Species_composition_circle_strack_bar_diagram.csv")

# Pre-arranged version of the long table used for plotting
long_data03 = read.table("data/Species_composition_circle_strack_bar_diagram.txt", 
                         header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)

# (Optional) add a synthetic x column to the data set
# library(tidyverse)
# long_data01 %>% 
#   ggpubr::mutate(new_x = rep(paste0('X',formatC(1:6,width = 2,flag = 0)),each=8)) -> dat01

# Basic stacked bar chart
library(ggplot2)
ggplot(data = long_data03,aes(x=group,y=abundance,fill=bacteria))+
  geom_bar(stat = "identity",position = "fill")

# Wrap into a ring with polar coordinates (negative ylim opens the centre)
ggplot(data = long_data03,aes(x=group,y=abundance,fill=bacteria))+
  geom_bar(stat = "identity",position = "fill")+
  coord_polar()+
  theme_bw()+
  ylim(-1,NA)

# Refined version: rotate each x label around the ring, strip the axes.
# NOTE(review): a vector of angles in element_text() relies on undocumented
# recycling behaviour -- confirm the rendering in the target ggplot2 version.
p1 <- ggplot(data = long_data03,aes(x=group,y=abundance,fill=bacteria))+
  geom_bar(stat = "identity",position = "fill")+
  coord_polar()+
  #scale_x_discrete(expand = expansion(add = 0),
  #                  labels=coalesce(long_data03$group[seq(1,348,8)],""))+
  scale_y_continuous(limits = c(-1,NA))+
  theme(axis.text.x = element_text(angle = cumsum(c(90,-rep(12,15))),
                                   vjust=0,hjust = 1),
        panel.background = element_blank(),
        axis.text.y = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank())
p1 

unique(long_data03$bacteria)
 [1] "Acidobacteria"               "Actinobacteria"             
 [3] "Armatimonadetes"             "Bacteroidetes"              
 [5] "Candidatus_Saccharibacteria" "Chlamydiae"                 
 [7] "Chloroflexi"                 "Firmicutes"                 
 [9] "Ignavibacteriae"             "Nitrospirae"                
[11] "Planctomycetes"              "Proteobacteria"             
[13] "Spirochaetes"                "Thaumarchaeota"             
[15] "Unassigned"                  "Verrucomicrobia"            
[17] NA                           
# Add group labels inside the ring (x = sector position, y < 0 = centre)
# and a fixed colour per phylum
p2 <- p1 + 
  annotate(geom = "text",x=4,y=-0.1,label="Group_A",angle=-60)+
  annotate(geom = "text",x=9,y=-0.1,label="Group_B",angle=10)+
  annotate(geom = "text",x=16,y=-0.1,label="Group_C",angle=-110)+
  scale_fill_manual(values = c("Acidobacteria"="#98d09d","Actinobacteria"="#d7e698",
                               "Armatimonadetes"="#5ebcc2","Bacteroidetes"="#46a9cb",
                              "Candidatus_Saccharibacteria"="#dadada","Chlamydiae"="#fbf398",
                              "Chloroflexi"="#5791c9","Firmicutes"="#7a76b7",
                              "Ignavibacteriae"="#f7a895","Nitrospirae"="#e77381",
                              "Planctomycetes"="#945893","Proteobacteria"="#5abf7e",
                              "Spirochaetes"="#9b8191","Thaumarchaeota"="#8f888b",
                              "Unassigned"="#ffe8d2","Verrucomicrobia"="#946f5c"),
  #limits=c("EX","EW","CR","EN","VU","DD","NT","LC"),
                   name="")
p2

ggsave(paste("results/Community_diversity_analysis/species_composition/Phylum_circle_stackplot",".pdf", sep=""), p2, width=159 * 1.5, height=79 * 1.5, unit='mm')

Ternary diagram (三元图)

# Use a ternary plot to show shared OTUs or microbial communities among
# three treatments (KO / OE / WT)
# install.packages("circlize")
library(circlize)
library(reshape2)
library(ggtern)
# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)

# Phylum level: keep the Phylum column plus the sample-count columns
otutab_p <- otutab2[, c(2, 8:25)]

# Sum counts per phylum
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]# drop the Phylum column (now in rownames)

# Transpose to samples x phyla; derive group labels from sample IDs
otutab_p3 <- t(otutab_p2)
otutab_p3 <- as.data.frame(otutab_p3)
otutab_p3$group <- rownames(otutab_p3)
otutab_p3$group = gsub("[0-9]","", otutab_p3$group)

# Mean abundance per group
otutab_p4 <- aggregate(.~ group, data = otutab_p3, mean)
rownames(otutab_p4) <- otutab_p4$group

# Transpose to phyla x groups and drop the group-name row
otutab_p5 <- as.data.frame(t(otutab_p4))
otutab_p5 <- otutab_p5[-1, ]
#write.csv(otutab_p5, "results/otutab_p5.csv")
# Curated version with the Average/type columns used for the plot
otutab_p6 = read.table("data/otutab_p5.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)

# Ternary plot: position = relative share in KO/OE/WT; size = mean abundance
p1 <- ggtern(data=otutab_p6,ggtern::aes(KO,OE,WT)) +geom_mask() + 
  geom_point(ggtern::aes(size=Average,color=type),alpha=1)+ggtern::theme_bw() +
  theme(axis.text=element_blank(),
        axis.ticks=element_blank())
p1

ggtern::ggsave(paste("results/Community_diversity_analysis/specific_microbe/Ternary_diagram",".pdf", sep=""), p1, width=159 * 1.5, height=79 * 1.5, unit='mm')

Total microbe + Specific microbe (共有微生物+特有微生物)

# Venn diagram of shared and unique OTU IDs among three example subsets
library(ggVennDiagram)
library(ggplot2)

# Read the OTU table; the first column supplies the rownames (OTU IDs)
otu_tax<-read.csv("data/test_otu.csv",row.names = 1)

# Three overlapping ID subsets taken by row position
ids_a <- rownames(otu_tax)[1:120]
ids_b <- rownames(otu_tax)[50:250]
ids_c <- rownames(otu_tax)[200:400]

# The list names become the set labels in the plot
Venn_data <- list(Soil = ids_a, Env1 = ids_b, Micro = ids_c)

# Dashed thin set edges, red-blue fill gradient for region counts
p1 <- ggVennDiagram(Venn_data, edge_lty = "dashed", edge_size = 0.1) +
  scale_fill_distiller(palette = "RdBu")
p1

ggplot2::ggsave(paste("results/Community_diversity_analysis/specific_microbe/Total_microbe_Specific microbe_venn_diagram",".pdf", sep=""), p1, width=159 * 1.5, height=79 * 1.5, unit='mm')

Venn diagram(维恩图)

# Draw Venn diagrams at the phylum level
# install.packages("VennDiagram")
library(VennDiagram)

# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Phylum level: sample-count columns only
otutab_p <- otutab2[, c(8:25)]

# Semi-transparent fill colours for the 2- to 5-set diagrams below
color1 <- scales::alpha("#99CC00",0.9)
color2 <- scales::alpha("#c77cff",0.9)
color3 <- scales::alpha("#f8766d",0.9)
color4 <- scales::alpha("#F3C300",0.8)
color5 <- scales::alpha("#FF99CC",0.7)

# label_alpha = 0 removes the background colour behind the count labels.
# NOTE(review): ggVennDiagram() comes from the ggVennDiagram package loaded
# earlier in this script, not from VennDiagram.
pdf("results/Community_diversity_analysis/specific_microbe/venn_ex2.pdf",width = 8,height = 7)
ggVennDiagram(otutab_p[1:2], set_size = 8,edge_size = 1.2,label_size =6,label_alpha=0) +
  scale_fill_gradient(low="white",high =color2)
dev.off()
png 
  2 
# Three-set Venn diagram
pdf("results/Community_diversity_analysis/specific_microbe/venn_ex3.pdf",width = 8,height = 7)
ggVennDiagram(otutab_p[1:3], set_size = 8,edge_size = 1.2,label_size =6,label_alpha=0) +
  scale_fill_gradient(low="white",high =color3)
dev.off()
png 
  2 
# Four-set Venn diagram (fill legend suppressed)
pdf("results/Community_diversity_analysis/specific_microbe/venn_ex4.pdf",width = 8,height = 7)
ggVennDiagram(otutab_p[1:4], label_alpha=0) +
  scale_fill_gradient(low="white",high =color4 ,guide="none")
dev.off()
png 
  2 
# Five-set Venn diagram with smaller labels and a paired edge palette
pdf("results/Community_diversity_analysis/specific_microbe/venn_ex5.pdf",width = 8,height = 7)
ggVennDiagram(otutab_p[1:5], label_alpha=0,label_size =3) +
  scale_color_brewer(palette = "Paired")+
  scale_fill_gradient(low="white",high = color5)
dev.off()
png 
  2 
# Plot UpSet
# Load packages
library(UpSetR)
library(openxlsx)
library(RColorBrewer)
library(ggplot2)

# Build a reproducible toy data set for the UpSet demonstrations
set.seed(123)
# Number of rows per column
n_rows <- 100
# Draw how many shared (repeated) values the columns will have: 3 to 5
n_rep <- sample(3:5, 1)
# Values that appear identically at the top of every column
repeated_values <- sample(1:100, n_rep, replace = FALSE)
# Each column = the shared values followed by independent random draws.
# Columns are evaluated left to right, so the RNG stream matches the
# original three inline sample() calls exactly.
draw_filler <- function() sample(1:100, n_rows - n_rep, replace = TRUE)
data <- data.frame(
  column1 = c(repeated_values, draw_filler()),
  column2 = c(repeated_values, draw_filler()),
  column3 = c(repeated_values, draw_filler())
)
# Check data
head(data)
  column1 column2 column3
1      79      79      79
2      51      51      51
3      14      14      14
4      67      67      67
5      42      42      42
6      50      85      72
# upset() options used below:
# - nsets: maximum number of sets to display
# - nintersects: how many intersections to show
# - mb.ratio: ratio of the intersection bar panel to the matrix panel
# - order.by: how intersections are sorted (by freq, then by degree)
# - decreasing: sort direction per key (freq descending, degree ascending)
# Quick default plot
upset(fromList(data))

# Adjusted and enhanced composite plot
pdf("results/Community_diversity_analysis/specific_microbe/upset01.pdf",width = 10,height = 7)
upset(fromList(data),     
      nsets=length(data),
      nintersects=30,# show at most the first 30 intersections
      sets=c("column1","column2","column3"), 
      number.angles = 0, 
      point.size=4,
      line.size=1, 
      mainbar.y.label="Intersection size", 
      main.bar.color = 'black',
      matrix.color="black",
      sets.x.label="Set size", 
      sets.bar.color=brewer.pal(3,"Set1"),
      mb.ratio = c(0.7, 0.3),
      order.by = "freq",
      text.scale=c(1.5,1.5,1.5,1.5,1.5,1), 
      shade.color="red" 
)
dev.off()
png 
  2 
# Highlight the intersection of specific sets
pdf("results/Community_diversity_analysis/specific_microbe/upset02.pdf",width = 10,height = 7)
upset(fromList(data),     
      nsets=length(data),
      nintersects=30,
      sets=c("column1","column2","column3"), 
      number.angles = 0, 
      point.size=4,
      line.size=1,
      mainbar.y.label="Intersection size", 
      main.bar.color = 'black', 
      matrix.color="black",
      sets.x.label="Set size",
      sets.bar.color=brewer.pal(3,"Set1"),
      mb.ratio = c(0.7, 0.3),
      order.by = "freq",
      text.scale=c(1.5,1.5,1.5,1.5,1.5,1), 
      shade.color="red", 
      # Queries colour specific intersections of interest
      queries=list(list(query=intersects,params=list("column1","column2"),color="red",active=T),
                   list(query=intersects,params=list("column1","column3"),color="blue",active=T),
                   list(query=intersects,params=list("column1","column2","column3"),color="green",active=T)
                   )
      )
dev.off()
png 
  2 

Venn-network diagram(维恩网络图)

# Venn network: features linked to the groups they occur in
library(MetaNet)
library(pcutils)

# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)

# Select the Phylum column plus the sample-count columns
otutab_p <- otutab2[, c(2, 8:25)]

# Sum counts per phylum
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]

# Transpose to samples x phyla; derive group labels from sample IDs
otutab_p3 <- t(otutab_p2)
otutab_p3 <- as.data.frame(otutab_p3)
otutab_p3$group <- rownames(otutab_p3)
otutab_p3$group = gsub("[0-9]","", otutab_p3$group)

# Mean abundance per group
otutab_p4 <- aggregate(.~ group, data = otutab_p3, mean)
rownames(otutab_p4) <- otutab_p4$group

# Transpose to phyla x groups and drop the group-name row
otutab_p4 <- as.data.frame(t(otutab_p4))
otutab_p4 <- otutab_p4[-1, ]

pdf("results/Community_diversity_analysis/specific_microbe/venn_network01.pdf",width = 8,height = 7)
venn(otutab_p4, mode = "network")
dev.off()
png 
  2 
# Another options
# EVenn: Easy to create repeatable and editable Venn diagrams and Venn networks online
# https://mp.weixin.qq.com/s/WQXq7ssRb96ZBVNqdOYBig
# https://mp.weixin.qq.com/s/aeu_teN5DRGB6DvYfDg_QQ

Sankey diagram (桑基图)

# Install and load the required R packages
# install.packages("remotes")
# remotes::install_github("davidsjoberg/ggsankey")
library(ggsankey)
library(ggalluvial)
library(ggplot2)

# Read data
df01 <- read.table(file = "data/data_sankey3.txt", sep = "\t", header = TRUE, check.names = FALSE)
data <- df01

# Convert data to lodes form (one row per axis x alluvium)
df <- to_lodes_form(data[, 1:ncol(data)],
                    axes = 1:ncol(data),
                    id = "value")

# Colour palette, recycled across the strata
col <- rep(c('#0ca9ce', '#78cfe5', '#c6ecf1', '#ff6f81', '#ff9c8f', '#ffc2c0', '#d386bf',
             '#cdb1d2', '#fae6f0', '#eb6fa6', '#ff88b5', '#00b1a5', "#ffa68f", "#ffca75", "#97bc83", "#acd295",
             "#00ada1", "#009f93", "#ace2da", "#448c99", "#00b3bc", "#b8d8c9", "#db888e", "#e397a4", "#ead0c7",
             "#8f9898", "#bfcfcb"), 6)

# Draw the Sankey diagram: flows, strata boxes, then labels
p1 <- ggplot(df, ggplot2::aes(x = x, fill = stratum, label = stratum,
                     stratum = stratum, alluvium = value), width = 0.1) +
  geom_flow(width = 0.1,
            curve_type = "sine",
            alpha = 0.6,
            color = 'white',
            size = 0.05) +
  geom_stratum(width = 0.1, color = "white") +
  geom_text(stat = 'stratum', size = 2.5, color = 'black') +
  scale_fill_manual(values = col) +
  ggplot2::theme_void() +
  theme(legend.position = 'none')

# Save the plot
ggplot2::ggsave(filename = "results/Community_diversity_analysis/sankey_plot/sankey_plot01.pdf", plot = p1, width = 7, height = 5, useDingbats = FALSE, limitsize = FALSE)

# Display the plot
p1

2.Difference analysis (2.差异分析)

Wilcox test(Batch wilcox test)

library(dplyr)
library(ggalluvial)
library(tidyverse)
library(tidyr)
library(reshape2)
library(ggpubr)

# Sum counts at the phylum level
df3_p <- read.table(file = "data/sum_p.txt", sep = "\t", header = T, check.names = FALSE)
design <- read.table(file = "data/group.txt", sep = "\t", header = T, row.names=1)
data_p<-aggregate(.~ Phylum,data=df3_p,sum)
rownames(data_p) = data_p$Phylum
data_p = data_p[, -1]# drop the Phylum column (17 phyla remain)
#write.csv(data_p, "results/779samples_phylum.csv")

# Convert to per-sample relative abundances, sorted by total abundance
data_p2 <- data_p
mean_sort = data_p2[(order(-rowSums(data_p2))), ]# decreasing order by relative abundance
mean_sort = as.data.frame(mean_sort)
mean_sort2 = t(mean_sort)
mean_sort2 = mean_sort2[order(-mean_sort2[,1]),]
mean_sort3 = t(mean_sort2)
mean_sort3 = apply(mean_sort3, 2, function(x) x/sum(x))
mean_sort3 = as.data.frame(mean_sort3)

# Samples as rows; derive group labels from sample IDs: the first digit
# becomes "_", remaining digits are removed (e.g. "A01" -> "A_")
#mean_sort3 = mean_sort3[(12 - 1):1, ]
mean_sort3 = t(mean_sort3)
mean_sort3 = as.data.frame(mean_sort3)
mean_sort3$group = rownames(mean_sort3)
mean_sort3$group = sub("[0-9]","_", mean_sort3$group)
mean_sort3$group = gsub("[0-9]","", mean_sort3$group)

# Drop columns that are zero in every sample
library(dplyr)
mean_sort4 <- mean_sort3 %>%
  select(where(~ any(. != 0)))

# Drop the group column; the index 12 must match the number of columns
# remaining in the actual table -- adjust for new data (TODO confirm)
mean_sort5 <- mean_sort4[, -12]
# log10-transformation (pseudo-count avoids log10(0))
mean_sort52 = log10(mean_sort5 + 1e-05)

# z-score standardization per column (phylum)
mean_sort6 = apply(mean_sort52, 2, function(x){
 return((x-mean(x))/sd(x))
})
#mean_sort6 = t(mean_sort6)
mean_sort6 <- as.data.frame(mean_sort6)
mean_sort6$group <- mean_sort3$group

# Wilcoxon test per phylum (`.` in the formula is the current column in
# the purrr lambda); p-values adjusted with Benjamini-Hochberg
diff_phylum <- mean_sort6 %>%
    select_if(is.numeric) %>%
    map_df(~ broom::tidy(wilcox.test(. ~ group,data = mean_sort6, conf.int = TRUE)), .id = 'var')
diff_phylum$padjust <- p.adjust(diff_phylum$p.value,"BH")
write.csv(diff_phylum, "results/Difference_analysis/Wilcox/Wilcox_test01.csv")

edgeR and DESeq2-Volcano plot(edgeR和DESeq2-火山图)

# edgeR differential analysis of KO gene-family counts
# Load packages
library(edgeR)

# Load data: gene-family counts and the sample grouping
df_KO <- read.table(file = "data/ko_gene_data.txt", sep = "\t", header = T, check.names = FALSE)
design <- read.table(file = "data/group_KO2.txt", sep = "\t", header = T, row.names=1)

# Move the Gene_family column into the rownames
df_KO2 <- df_KO
rownames(df_KO2) <- df_KO$Gene_family
df_KO2 <- df_KO2[, -1]

design2 <- t(design)

# Construct the DGEList object (counts + group factor)
dgelist <- DGEList(counts = df_KO2, group = design2)
dgelist
An object of class "DGEList"
$counts
          Healthy01    Healthy02    Healthy03    Healthy04    Healthy05
K00001  5513652.000 5.030384e+06 5.011590e+06 5.071771e+06 5.070956e+06
K00002 26183663.270 2.746872e+07 2.277124e+07 2.964312e+07 2.409072e+07
K00003        0.000 0.000000e+00 0.000000e+00 1.058710e+01 0.000000e+00
K00004        0.000 1.055100e+01 0.000000e+00 1.289566e+01 4.103165e+01
K00005      265.022 3.200185e+01 1.353479e+02 1.232836e+02 1.529002e+02
          Healthy06    Healthy07    Healthy08    Healthy09    Healthy10
K00001 4.112742e+06 6.706504e+06  7159849.000 5.833764e+06 5.260313e+06
K00002 2.322997e+07 2.542978e+07 24574795.310 2.980676e+07 2.381957e+07
K00003 0.000000e+00 0.000000e+00        0.000 0.000000e+00 0.000000e+00
K00004 0.000000e+00 9.239522e+00        0.000 0.000000e+00 5.735490e+01
K00005 4.719408e+01 1.890843e+02      196.229 6.976540e+01 1.093376e+02
         Patients01   Patients02   Patients03   Patients04   Patients05
K00001 5.196891e+06 5.611760e+06 9.980051e+06 6.585471e+06 6.156271e+06
K00002 2.753910e+07 3.440364e+07 3.475113e+07 2.459670e+07 2.896511e+07
K00003 4.333895e+01 5.293551e+01 0.000000e+00 0.000000e+00 0.000000e+00
K00004 0.000000e+00 2.344529e+01 2.238694e+02 0.000000e+00 2.227433e+01
K00005 9.333383e+02 7.053999e+01 4.654725e+02 1.726085e+02 1.669811e+02
         Patients06   Patients07   Patients08   Patients09   Patients10
K00001 7.695370e+06 5.269730e+06 1.041806e+07 4.992085e+06 3.976257e+06
K00002 2.431583e+07 2.690927e+07 2.909897e+07 6.317151e+07 2.386349e+07
K00003 5.433096e+02 0.000000e+00 1.251203e+01 9.624639e+00 1.455884e+02
K00004 6.916764e+01 0.000000e+00 7.577184e+01 0.000000e+00 0.000000e+00
K00005 2.569143e+02 1.599218e+02 6.971007e+01 1.307975e+02 1.535119e+02
195 more rows ...

$samples
            group lib.size norm.factors
Healthy01 Healthy 31746866            1
Healthy02 Healthy 32561511            1
Healthy03 Healthy 27807172            1
Healthy04 Healthy 34762673            1
Healthy05 Healthy 29207202            1
15 more rows ...
# Filter lowly-expressed genes. edgeR recommends filtering on CPM
# (counts per million, via cpm()): keep a gene when its CPM exceeds 1 in
# at least 2 samples; CPM 1 corresponds roughly to a count above
# (smallest library size / 1e6).
keep <- rowSums(cpm(dgelist) > 1 ) >= 2
dgelist <- dgelist[keep, ,keep.lib.sizes = FALSE]

# TMM normalisation factors
dgelist_norm <- calcNormFactors(dgelist, method = 'TMM')

# Unsupervised clustering of samples (MDS plot; red = first 5, blue = last 5)
plotMDS(dgelist_norm, col = rep(c('red', 'blue'), each = 5), dim = c(1, 2))

# Estimate dispersions for the negative-binomial model
group <- design2[1,]
design <- model.matrix(~group)# NOTE(review): overwrites the sample-sheet `design`
dge <- estimateDisp(dgelist_norm, design, robust = TRUE)
plotBCV(dge)

# Differential gene analysis:
# negative binomial generalized log-linear model + likelihood-ratio test
fit <- glmFit(dge, design, robust = TRUE) 
lrt <- glmLRT(fit)   # likelihood-ratio test
topTags(lrt)
Coefficient:  groupPatients 
          logFC      logCPM       LR       PValue          FDR
K00106 9.206795  0.04061104 20.81904 5.047850e-06 0.0006562206
K00180 8.505768 -0.61463819 18.18007 2.009705e-05 0.0010226081
K00185 8.341201 -0.76684000 17.44843 2.952080e-05 0.0010226081
K00089 6.647059  1.29328702 17.32721 3.146486e-05 0.0010226081
K00105 7.130335  0.33157245 15.72705 7.317012e-05 0.0016010605
K00199 5.448346  2.14159636 15.70840 7.389510e-05 0.0016010605
K00172 6.050092  1.75470139 15.15904 9.882384e-05 0.0016735281
K00109 7.188984  0.27482588 15.08115 1.029863e-04 0.0016735281
K00162 5.630761  1.93221277 14.60246 1.327409e-04 0.0018393508
K00193 6.816308 -0.23316972 14.48222 1.414885e-04 0.0018393508
#write.csv(topTags(lrt, n = nrow(dgelist$counts)), 'npc_glmQLFTest.csv', quote = FALSE) 

# DESeq2
# Load packages
library(DESeq2)
# Load data: KO gene count table (genes x samples) and the sample grouping
df_KO <- read.table(file = "data/ko_gene_data.txt", sep = "\t", header = T, check.names = FALSE)
design <- read.table(file = "data/group_KO2.txt", sep = "\t", header = T, row.names=1)
design$group <- factor(design$group)

# Move the Gene_family column into row names and drop it
df_KO2 <- df_KO
rownames(df_KO2) <- df_KO$Gene_family
df_KO2 <- df_KO2[, -1]
identical(colnames(df_KO2), rownames(design))
[1] TRUE
# Restrict both tables to the shared samples, in the same order
rows_to_keep <- base::intersect(colnames(df_KO2), rownames(design))
group <- design[rows_to_keep,,drop=F]
df_KO3 <- df_KO2[,rows_to_keep]
identical(colnames(df_KO3), rownames(group))
[1] TRUE
colnames(group)[1] <- "Group"

# Filter genes with count values of 0 in all samples
df_KO4 <- df_KO3[rowSums(df_KO3)!=0, ]
# Keep only KOs listed in the DMO table (rowname membership filter)
DMO <- read.table(file = "data/DMO.txt", sep = "\t", header = T, row.names=1)
df_KO5 <- df_KO4[rownames(df_KO4) %in% rownames(DMO), ]

# Difference analysis
# Build the DESeq2 dataset; counts are rounded because DESeq2 requires integers
dds <- DESeq2::DESeqDataSetFromMatrix(countData = round(df_KO5),
                                      colData=group,
                                      design = ~ Group)
# sfType = "poscounts" computes size factors that tolerate features with zeros
dds_res <- DESeq2::DESeq(dds, sfType = "poscounts")

# Extract Patients vs Healthy contrast (log2FC = Patients / Healthy)
res <- results(dds_res, 
               tidy=T, 
               format="DataFrame",
               contrast = c("Group","Patients","Healthy"))
# head(res)

# Volcano plot of the DESeq2 results
DEG<-res
logFC_cutoff<-1
# Classify genes: UP / DOWN when FDR-adjusted p < 0.05 and |log2FC| > cutoff.
# (Fixed from `padj < 0.5`, an evident typo that would call nearly everything
# significant; 0.05 is the conventional FDR threshold.)
DEG$change<-as.factor(ifelse(DEG$padj<0.05&abs(DEG$log2FoldChange)>logFC_cutoff,
                             ifelse(DEG$log2FoldChange>logFC_cutoff,"UP","DOWN"),
                             "NOT"))
this_title <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                     '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
                     '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',]))
DEG<-na.omit(DEG)
library(ggplot2)
# y-axis plots raw p-values, so the label now says so (was "-log10 FDR",
# which contradicted `y = -log10(pvalue)`)
ggplot(data=DEG,ggplot2::aes(x=log2FoldChange,
                    y=-log10(pvalue),
                    color=change))+
  geom_point(alpha=0.8,size=3)+
  labs(x="log2 fold change")+ ylab("-log10 P-value")+
  ggtitle(this_title)+ggplot2::theme_bw(base_size = 20)+
  theme(plot.title = element_text(size=15,hjust=0.5))+
  scale_color_manual(values=c('#a121f0','#bebebe','#ffad21')) -> p1
p1

# Using generalized fold change
# KO genes: reload counts and convert to per-sample relative abundances
df_KO <- read.table(file = "data/ko_gene_data.txt", sep = "\t", header = T, check.names = FALSE)
df_KO2 <- df_KO
rownames(df_KO2) <- df_KO$Gene_family
df_KO2 <- df_KO2[, -1]
# Column-wise normalization: each sample sums to 1
df_KO3 <- apply(df_KO2, 2, function(x) x/sum(x))
# NOTE(review): drops the first two rows — presumably UNMAPPED/UNGROUPED
# summary rows of the HUMAnN-style table; confirm against the input file.
df_KO4 <- df_KO3[c(-1,-2), ]
feat.all <- df_KO4
# Keep only samples present in the design table
feat.all <- feat.all[, colnames(feat.all) %in% rownames(design)]

# Drop features absent from every retained sample
feat.all <- feat.all[rowSums(feat.all)!=0, ]

# Restrict to KOs listed in the DMO table
DMO <- read.table(file = "data/DMO.txt", sep = "\t", header = T, row.names=1)
feat.all2 <- feat.all[rownames(feat.all) %in% rownames(DMO), ]

# Reload the design and keep sample IDs as an explicit column for filtering
design <- read.table(file = "data/group_KO2.txt", sep = "\t", header = T, row.names=1)
design$Sample_ID <- rownames(design)
meta <- design
stopifnot(all(meta$Sample_ID %in% colnames(feat.all)))

# Calculate generalized fold change (gFC): the mean difference between matched
# log10-abundance quantiles (10%..90% in 5% steps) of the two groups.
library(dplyr)
# Hoist the loop-invariant group membership out of the loop
patient_ids <- meta %>% filter(group=='Patients') %>% pull(Sample_ID)
healthy_ids <- meta %>% filter(group=='Healthy') %>% pull(Sample_ID)
result_list <- list()
for (f in row.names(feat.all2)) {
  x <- feat.all2[f, patient_ids]
  y <- feat.all2[f, healthy_ids]
  # Quantiles of log10 abundance; 1e-8 pseudo-count avoids log10(0)
  q.p <- quantile(log10(x+1e-8), probs=seq(.1, .9, .05))
  q.n <- quantile(log10(y+1e-8), probs=seq(.1, .9, .05))
  # gFC = average quantile difference (Patients minus Healthy)
  # Fixed: use [[ to store a scalar element (the original `result_list[f] <-
  # list(col1 = fc)` relied on `[<-` unwrapping the one-element list)
  result_list[[f]] <- sum(q.p - q.n)/length(q.p)
}
# One row per feature, rownames carried over from the list names
result_df2 <- data.frame(gFC = unlist(result_list))
#cat('\n')

#res2 <- res[c(-1,-2), ]
# Attach the generalized fold change to the DESeq2 result table
res2 <- res
rownames(res2) <- res2$row
# Align gFC by feature name instead of by position: res2 and result_df2 went
# through different filtering steps, so row order is not guaranteed to match.
# Features absent from result_df2 get NA and are dropped by na.omit() below.
res2$gFC <- result_df2[rownames(res2), "gFC"]
# NOTE(review): gFC can be negative, so log2(gFC + 1) is NaN for gFC <= -1;
# this column is kept for compatibility but is not used by the plot below.
res2$log2gFC <- log2(res2$gFC+1)

# Volcano plot
DEG2<-res2
#logFC_cutoff<-2
gFC_cutoff <- 0.5
# UP / DOWN when FDR < 0.05 and |gFC| > cutoff
# (fixed from `padj < 0.5`, an evident typo for the conventional 0.05)
DEG2$change2<-as.factor(ifelse(DEG2$padj<0.05&abs(DEG2$gFC)>gFC_cutoff,
                             ifelse(DEG2$gFC>gFC_cutoff,"UP","DOWN"),
                             "NOT"))
this_title <- paste0('Cutoff for gFC is ',round(gFC_cutoff,3),
                     '\nThe number of up gene is ',nrow(DEG2[DEG2$change2 =='UP',]) ,
                     '\nThe number of down gene is ',nrow(DEG2[DEG2$change2 =='DOWN',]))
DEG2<-na.omit(DEG2)
library(ggplot2)
ggplot(data=DEG2,ggplot2::aes(x=gFC,
                    y=-log10(padj),
                    color=change2))+
  geom_point(alpha=0.8,size=3)+
  labs(x="gFC")+ ylab("-log10 FDR")+
  ggtitle(this_title)+ggplot2::theme_bw(base_size = 20)+
  theme(plot.title = element_text(size=15,hjust=0.5))+
  scale_color_manual(values=c('#a121f0','#bebebe','#ffad21')) -> p2
p2

#+xlim(NA,5)+ylim(NA,40) -> p2
write.csv(DEG2, "results/Difference_analysis/edgeR_DESeq2/KO_difference_DMO01.csv")

DESeq2-Manhattan diagram(DESeq2-曼哈顿图)

# Reference: https://mp.weixin.qq.com/s/KZa-L9Fyv-5FIU9j5vPqmQ
# rm(list=ls())
# Load packages
library(DESeq2)
library(ggplot2)
library(tidyverse)

# Load data: OTU table whose first 7 columns are the taxonomy lineage
OTU <- read.table("data/otutab3.txt", sep = "\t",  row.names = 1,stringsAsFactors =FALSE, check.names =FALSE,header=1)
metadata <- read.delim(file = "data/metadata2.txt", sep = '\t', stringsAsFactors = FALSE)
metadata$Group <- as.factor(metadata$Group)

# Split off the taxonomy columns, keeping the OTU id as the first column
tax <- cbind(rownames(OTU), OTU[, 1:7])
colnames(tax)[1] <- "OTU"
tax <- as.data.frame(tax)

# Select Phylum level data
# (column 2 = Phylum, columns 8:25 = the 18 sample count columns)
OTU_phylum <- OTU[, c(2, 8:25)]
# sum of Phylum
OTU_phylum <- aggregate(.~ Phylum, data = OTU_phylum, sum)
rownames(OTU_phylum) = OTU_phylum$Phylum
OTU_phylum = OTU_phylum[, -1]# 1963 species
# Assign the extracted phylum-level information back to a new column of the tax data frame
tax$phylum <- tax$Phylum

# Filter OTUs with relative abundance below the threshold
otu_relative <- apply(OTU_phylum, 2, function(x){x/sum(x)})
threshold = 0.0005
# Keep a phylum if it exceeds the threshold in at least one sample
idx <- rowSums(otu_relative > threshold) >= 1
otu <- as.data.frame(OTU_phylum[idx, ])
otu_relative <- as.data.frame(otu_relative[idx, ])

# DESeq2 differential expression analysis
# Constructing a DESeqDataSet object
dds <- DESeqDataSetFromMatrix(countData = otu, colData = metadata, design = ~Group) 

# Normalize the original dds
dds <- DESeq(dds)

# Use the results() function in the DESeq2 package to extract the results of the differential analysis
# Contrast order is c(factor, numerator, denominator): log2FC = OE / KO
group='Group'
treatment = 'KO'
control ='OE'
res <- results(dds, contrast=c(group, control, treatment))

# Use the order() function to sort the result res by pvalue value
res = res[order(res$pvalue),]
# Print the full result table and its summary (console output shown below)
res
log2 fold change (MLE): Group OE vs KO 
Wald test p-value: Group OE vs KO 
DataFrame with 15 rows and 6 columns
                 baseMean log2FoldChange     lfcSE       stat      pvalue
                <numeric>      <numeric> <numeric>  <numeric>   <numeric>
Proteobacteria  21656.369      -1.035522  0.218080   -4.74835 2.05079e-06
Planctomycetes     87.803       1.434603  0.322774    4.44460 8.80552e-06
Actinobacteria  10561.924      -0.591404  0.169149   -3.49634 4.71685e-04
Verrucomicrobia   104.606       0.851992  0.255028    3.34078 8.35420e-04
Acidobacteria     160.148       0.606024  0.287930    2.10476 3.53119e-02
...                   ...            ...       ...        ...         ...
Chlamydiae       15.95048     0.27160216  0.499050  0.5442380    0.586278
Firmicutes      685.46414     0.13970686  0.300424  0.4650319    0.641909
Spirochaetes     47.14644    -0.10156322  0.362492 -0.2801803    0.779339
Ignavibacteriae   5.38696     0.08623627  0.872991  0.0987826    0.921311
Nitrospirae      31.29710    -0.00496568  0.396255 -0.0125315    0.990002
                       padj
                  <numeric>
Proteobacteria  3.07619e-05
Planctomycetes  6.60414e-05
Actinobacteria  2.35843e-03
Verrucomicrobia 3.13283e-03
Acidobacteria   1.05936e-01
...                     ...
Chlamydiae         0.799470
Firmicutes         0.802386
Spirochaetes       0.899238
Ignavibacteriae    0.987119
Nitrospirae        0.990002
summary(res)

out of 15 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 2, 13%
LFC < 0 (down)     : 2, 13%
outliers [1]       : 0, 0%
low counts [2]     : 0, 0%
(mean count < 5)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# Save results
res <- data.frame(res, stringsAsFactors = FALSE, check.names = FALSE)
write.table(res, 'results/Difference_analysis/edgeR_DESeq2/control_treat.DESeq2.txt', col.names = NA, sep = '\t', quote = FALSE)

# Screening of differential OTUs
# Sort by padj ascending then log2FC descending; a per-key `decreasing`
# vector is only supported by the radix method, so request it explicitly.
res1 <- res[order(res$padj, res$log2FoldChange, decreasing = c(FALSE, TRUE), method = "radix"), ]

# log2FC >= 1  & padj < 0.01 -> up   (significantly up-regulated OTUs)
# log2FC <= -1 & padj < 0.01 -> down (significantly down-regulated OTUs)
# everything else            -> none (non-differential OTUs)
res1[which(res1$log2FoldChange >= 1 & res1$padj < 0.01),'sig'] <- 'up'
res1[which(res1$log2FoldChange <= -1 & res1$padj < 0.01),'sig'] <- 'down'
# Strict < 1 here (was <= 1), so OTUs with |log2FC| exactly 1 and padj < 0.01
# keep the up/down label assigned above instead of being overwritten to 'none'
res1[which(abs(res1$log2FoldChange) < 1 | res1$padj >= 0.01),'sig'] <- 'none'
write.table(res1, file = 'results/Difference_analysis/edgeR_DESeq2/control_treat.txt', sep = '\t', col.names = NA, quote = FALSE)

# Output the selected differentially expressed genes list
res1_select <- subset(res1, sig %in% c('up', 'down'))
write.table(res1_select, file = 'results/Difference_analysis/edgeR_DESeq2/control_treat.DESeq2.select.txt', sep = '\t', col.names = NA, quote = FALSE)

# Separate output according to up and down
res1_up <- subset(res1, sig == 'up')
res1_down <- subset(res1, sig == 'down')

write.table(res1_up, file = 'results/Difference_analysis/edgeR_DESeq2/control_treat.DESeq2.up.txt', sep = '\t', col.names = NA, quote = FALSE)
write.table(res1_down, file = 'results/Difference_analysis/edgeR_DESeq2/control_treat.DESeq2.down.txt', sep = '\t', col.names = NA, quote = FALSE)

# Extract differential OTUs and merge them
# Check the number of P<0.05 after FDR correction
table(res1$padj<0.05) 

FALSE  TRUE 
   11     4 
# Extract differential OTU (stricter cutoff than the screening above)
diff_OTU_deseq2 <-subset(res1, padj < 0.01 & abs(log2FoldChange) > 1)
dim(diff_OTU_deseq2)
[1] 2 7
head(diff_OTU_deseq2)
                baseMean log2FoldChange     lfcSE      stat       pvalue
Proteobacteria 21656.369      -1.035522 0.2180802 -4.748354 2.050794e-06
Planctomycetes    87.803       1.434603 0.3227742  4.444601 8.805515e-06
                       padj  sig
Proteobacteria 3.076190e-05 down
Planctomycetes 6.604136e-05   up
# paste0 (was paste) so the file name is not riddled with spaces
write.csv(diff_OTU_deseq2, file= paste0("results/Difference_analysis/edgeR_DESeq2/DEOTU_",control,"_vs_",treatment,".csv"))

# Calculate the average abundance value per group, then reshape so rows are
# phyla and columns are the two group means
abundance<-aggregate(t(otu_relative),by=list(metadata$Group),FUN=mean)
abundance<-as.data.frame(t(abundance))
colnames(abundance)<-abundance[1,]
abundance<-abundance[-1,]
abundance<-as.data.frame(lapply(abundance, as.numeric))
# Overall mean abundance across the KO and OE group means
res1$abundance <- apply(cbind(abundance$KO,abundance$OE), 1, function(x){mean(x)})

# Merge data
# NOTE(review): this merges res1 with ITSELF (every column is duplicated with
# .x/.y suffixes); presumably one side was meant to be a different table —
# verify against the original tutorial. Downstream code only uses .x columns.
data <- merge(as.data.frame(res1), res1,by="row.names",sort=FALSE,all=F)
# Filter relative abundance below 0.01%
data<-data[(data$abundance.x>0.0001),]
# Convert data to character type
rownames(data) <- as.character(data$Row.names)
tax$OTU <- as.character(tax$OTU)
# Use the match function to find the position of Row.names in the OTU column of tax in the data data frame
indexes <- match(data$Row.names, tax$OTU)
# Use these positional indices to extract information from the phylum column in the tax data frame.
data$phylum <- tax$phylum[indexes]
# NOTE(review): the next line overwrites the lookup above with the row name;
# rows here are already phylum-level so the value is likely the same — confirm.
data$phylum <- data$Row.names
# Convert p-value to its negative NATURAL log (not log10)
data$neglogp = -log(data$pvalue.x)
# cbind() of mixed columns coerces everything to character
data<-as.data.frame(cbind(data$Row.names, data$log2FoldChange.x, data$pvalue.x, data$phylum,data$abundance.x, data$neglogp))
colnames(data)<-c("otu","log2FoldChange","pvalue","phylum","Abundance","neglogp")
# Change the data type to numeric
# NOTE(review): log2FoldChange is NOT converted and stays character; the later
# comparison `log2FoldChange < 0` works only because negative numbers sort
# before "0" lexicographically — consider adding it to transform().
data<-transform(data, Abundance = as.numeric(Abundance), neglogp = as.numeric(neglogp), pvalue= as.numeric(pvalue))
# Marker difference OTU type
data$level = as.factor(ifelse(data$pvalue>=0.05, "nosig", ifelse(data$pvalue<0.05&data$log2FoldChange<0, "enriched","depleted")))
# Save results
# NOTE(review): paste() inserts spaces into the file name; paste0 was likely intended
write.csv(data, file= paste("results/Difference_analysis/edgeR_DESeq2/OTU",control,"_vs_",treatment,".csv"))


# Manhattan plot
label = unique(data$phylum)
label = label[!(label %in% "Low Abundance")] # Delete low abundance

# Set order
data$phylum = factor(data$phylum, levels = c(label, "Low Abundance"))
data$level = factor(data$level, levels = c("enriched","depleted","nosig"))

# Plot
#data[data$neglogp>30,]$neglogp  = 30
Title=paste("Differential OTU in ",control," vs ",treatment)

# OTU as X-axis
# neglogp was computed with the NATURAL log upstream; divide by log(10) so the
# axis really is -log10(P) as labeled, and draw the threshold at -log10(0.05)
p <- ggplot(data, ggplot2::aes(x=otu, y=neglogp/log(10), color=phylum, shape=level, size=Abundance)) +
  geom_hline(yintercept=-log10(0.05), linetype=2, color="lightgrey") +
  geom_point(alpha=.6,position=position_jitter(0.5),stroke=2) +
  scale_shape_manual(values=c(17,25, 20))+
  scale_size(breaks=c(5,10,15))+
  labs(x="OTU", y="-log10(P)",title=Title)+
  ggplot2::theme_classic()+ 
  theme(axis.ticks.x=element_blank(),axis.text.x=element_blank(),
    legend.position="top", 
    panel.grid = element_blank()) 
p

# Set color
library(RColorBrewer)
# List all ColorBrewer palettes (table printed below)
brewer.pal.info
         maxcolors category colorblind
BrBG            11      div       TRUE
PiYG            11      div       TRUE
PRGn            11      div       TRUE
PuOr            11      div       TRUE
RdBu            11      div       TRUE
RdGy            11      div      FALSE
RdYlBu          11      div       TRUE
RdYlGn          11      div      FALSE
Spectral        11      div      FALSE
Accent           8     qual      FALSE
Dark2            8     qual       TRUE
Paired          12     qual       TRUE
Pastel1          9     qual      FALSE
Pastel2          8     qual      FALSE
Set1             9     qual      FALSE
Set2             8     qual       TRUE
Set3            12     qual      FALSE
Blues            9      seq       TRUE
BuGn             9      seq       TRUE
BuPu             9      seq       TRUE
GnBu             9      seq       TRUE
Greens           9      seq       TRUE
Greys            9      seq       TRUE
Oranges          9      seq       TRUE
OrRd             9      seq       TRUE
PuBu             9      seq       TRUE
PuBuGn           9      seq       TRUE
PuRd             9      seq       TRUE
Purples          9      seq       TRUE
RdPu             9      seq       TRUE
Reds             9      seq       TRUE
YlGn             9      seq       TRUE
YlGnBu           9      seq       TRUE
YlOrBr           9      seq       TRUE
YlOrRd           9      seq       TRUE
display.brewer.all(type="qual") 

# NOTE(review): Dark2 has at most 8 colors; requesting 15 warns and returns 8
brewer.pal(15, 'Dark2')
[1] "#1B9E77" "#D95F02" "#7570B3" "#E7298A" "#66A61E" "#E6AB02" "#A6761D"
[8] "#666666"
# Set color: 7 Dark2 colors padded with 8 custom hues (15 levels total)
p <- p+scale_color_manual(values = c("#1B9E77", "#D95F02", "#7570B3" ,"#E7298A", "#66A61E","#E6AB02" ,"#A6761D",
                                     "#5ebcc2", "#1b868c","#46a9cb", "#5791c9" ,"#7a76b7", "#945893","#9c3d62" ,"#882100"))  
p

# Save plot
ggplot2::ggsave("results/Difference_analysis/edgeR_DESeq2/man_otu.pdf", p, width = 9, height = 4)

# Phylum level data as X-axis
# neglogp is a natural log upstream; divide by log(10) so the axis matches the
# "-log10(P)" label, with the significance line at -log10(0.05)
p<-ggplot(data, ggplot2::aes(x=phylum, y=neglogp/log(10), color=phylum, shape=level, size=Abundance)) +
  geom_hline(yintercept=-log10(0.05), linetype=2, color="lightgrey") +
  geom_point(alpha=.6,position=position_jitter(0.5),stroke=2) +
  scale_shape_manual(values=c(17, 25, 20))+
  scale_size(breaks=c(5,10,20))+
  labs(x=NULL, y="-log10(P)",title=Title)+
  ggplot2::theme_classic()+ 
  theme(legend.position="top", 
    panel.grid = element_blank(), 
    axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5)
  )
p

library(RColorBrewer)
# List all ColorBrewer palettes again (table printed below)
brewer.pal.info
         maxcolors category colorblind
BrBG            11      div       TRUE
PiYG            11      div       TRUE
PRGn            11      div       TRUE
PuOr            11      div       TRUE
RdBu            11      div       TRUE
RdGy            11      div      FALSE
RdYlBu          11      div       TRUE
RdYlGn          11      div      FALSE
Spectral        11      div      FALSE
Accent           8     qual      FALSE
Dark2            8     qual       TRUE
Paired          12     qual       TRUE
Pastel1          9     qual      FALSE
Pastel2          8     qual      FALSE
Set1             9     qual      FALSE
Set2             8     qual       TRUE
Set3            12     qual      FALSE
Blues            9      seq       TRUE
BuGn             9      seq       TRUE
BuPu             9      seq       TRUE
GnBu             9      seq       TRUE
Greens           9      seq       TRUE
Greys            9      seq       TRUE
Oranges          9      seq       TRUE
OrRd             9      seq       TRUE
PuBu             9      seq       TRUE
PuBuGn           9      seq       TRUE
PuRd             9      seq       TRUE
Purples          9      seq       TRUE
RdPu             9      seq       TRUE
Reds             9      seq       TRUE
YlGn             9      seq       TRUE
YlGnBu           9      seq       TRUE
YlOrBr           9      seq       TRUE
YlOrRd           9      seq       TRUE
display.brewer.all(type="qual") 

brewer.pal(7, 'Dark2')
[1] "#1B9E77" "#D95F02" "#7570B3" "#E7298A" "#66A61E" "#E6AB02" "#A6761D"
# Set color: 7 Dark2 colors padded with 8 custom hues
p <- p+scale_color_manual(values = c("#1B9E77", "#D95F02", "#7570B3" ,"#E7298A", "#66A61E","#E6AB02" ,"#A6761D",
                                     "#5ebcc2", "#1b868c","#46a9cb", "#5791c9" ,"#7a76b7", "#945893","#9c3d62" ,"#882100")) 
p

ggplot2::ggsave("results/Difference_analysis/edgeR_DESeq2/man_otu2.pdf", p, width = 9, height = 6)

STAMP_difference analysis(STAMP_差异分析)

## Load data: KEGG level-2 pathway abundances and a two-column group file
data <- read.table("data/KEGG_L2.txt",header = TRUE,row.names = 1,sep = "\t")
group <- read.table("data/group_stamp.txt",header = FALSE,sep = "\t")
library(tidyverse)
# Convert to percentages and keep pathways with mean abundance > 1%
data <- data*100
data <- data %>% filter(apply(data,1,mean) > 1)
# Transpose to samples x pathways and append the group label
# NOTE(review): assumes the row order of the group file matches the column
# order of the abundance table — verify, as there is no join by sample ID.
data <- t(data)
data1 <- data.frame(data,group$V2)
colnames(data1) <- c(colnames(data),"Group")
data1$Group <- as.factor(data1$Group)

## t-test: one Welch t-test per pathway, tidied into one row each
diff <- data1 %>% 
    select_if(is.numeric) %>%
    map_df(~ broom::tidy(t.test(. ~ Group,data = data1)), .id = 'var')

# Bonferroni-adjust in place (raw p-values are overwritten), keep p < 0.05
diff$p.value <- p.adjust(diff$p.value,"bonferroni")
diff <- diff %>% filter(p.value < 0.05)

## Wilcoxon rank-sum test per pathway (non-parametric analogue of the t-test)
library(tidyverse)
diff1 <- data1 %>% 
    select_if(is.numeric) %>%
    map_df(~ broom::tidy(wilcox.test(. ~ Group,data = data1)), .id = 'var')

diff1$p.value <- p.adjust(diff1$p.value,"bonferroni")
# Fixed copy-paste bug: filter the Wilcoxon table itself. The original
# `diff1 <- diff %>% filter(...)` threw the Wilcoxon results away and
# re-filtered the t-test table `diff`.
diff1 <- diff1 %>% filter(p.value < 0.05)

## Drawing data construction (based on the significant t-test pathways)
## Left bar chart: mean abundance per pathway per group, long format
abun.bar <- data1[,c(diff$var,"Group")] %>% 
    gather(variable,value,-Group) %>% 
    group_by(variable,Group) %>% 
    dplyr::summarise(Mean = mean(value))

## Scatter plot on the right: estimate = group-mean difference with 95% CI
diff.mean <- diff[,c("var","estimate","conf.low","conf.high","p.value")]
# Positive estimate -> enriched in the first factor level, negative -> second
diff.mean$Group <- c(ifelse(diff.mean$estimate >0,levels(data1$Group)[1],
                            levels(data1$Group)[2]))
diff.mean <- diff.mean[order(diff.mean$estimate,decreasing = TRUE),]

## Left bar chart
library(ggplot2)
#cbbPalette <- c("#E69F00", "#56B4E9")
cbbPalette <- c("#5791c9", "#5ebcc2")
# Order the bars to match the sorted effect sizes on the right-hand panel
abun.bar$variable <- factor(abun.bar$variable,levels = rev(diff.mean$var))
p1 <- ggplot(abun.bar,ggplot2::aes(variable,Mean,fill = Group)) +
    scale_x_discrete(limits = levels(diff.mean$var)) +
    coord_flip() +
    xlab("") +
    ylab("Mean proportion (%)") +
    theme(panel.background = element_rect(fill = 'transparent'),
          panel.grid = element_blank(),
          axis.ticks.length = unit(0.4,"lines"), 
          axis.ticks = element_line(color='black'),
          axis.line = element_line(colour = "black"),
          axis.title.x=element_text(colour='black', size=12,face = "bold"),
          axis.text=element_text(colour='black',size=10,face = "bold"),
          legend.title=element_blank(),
          legend.text=element_text(size=12,face = "bold",colour = "black",
                                   margin = margin(r = 20)),
          legend.position = c(-1,-0.1),
          legend.direction = "horizontal",
          legend.key.width = unit(0.8,"cm"),
          legend.key.height = unit(0.5,"cm"))


# Alternating grey/white background stripes, one per pathway row
# NOTE(review): with a single significant pathway, 1:(n-1) is 1:0 and draws
# two spurious stripes — fine here because several pathways pass the filter
for (i in 1:(nrow(diff.mean) - 1)) 
    p1 <- p1 + ggplot2::annotate('rect', xmin = i+0.5, xmax = i+1.5, ymin = -Inf, ymax = Inf, 
                        fill = ifelse(i %% 2 == 0, 'white', 'gray95'))

# Draw bars last so they sit on top of the background stripes
p1 <- p1 + 
    geom_bar(stat = "identity",position = "dodge",width = 0.7,colour = "black") +
    scale_fill_manual(values=cbbPalette)


## Scatter plot on the right
diff.mean$var <- factor(diff.mean$var,levels = levels(abun.bar$variable))
diff.mean$p.value <- signif(diff.mean$p.value,3)
diff.mean$p.value <- as.character(diff.mean$p.value)
p2 <- ggplot(diff.mean,ggplot2::aes(var,estimate,fill = Group)) +
    theme(panel.background = element_rect(fill = 'transparent'),
          panel.grid = element_blank(),
          axis.ticks.length = unit(0.4,"lines"), 
          axis.ticks = element_line(color='black'),
          axis.line = element_line(colour = "black"),
          axis.title.x=element_text(colour='black', size=12,face = "bold"),
          axis.text=element_text(colour='black',size=10,face = "bold"),
          axis.text.y = element_blank(),
          legend.position = "none",
          axis.line.y = element_blank(),
          axis.ticks.y = element_blank(),
          plot.title = element_text(size = 15,face = "bold",colour = "black",hjust = 0.5)) +
    scale_x_discrete(limits = levels(diff.mean$var)) +
    coord_flip() +
    xlab("") +
    ylab("Difference in mean proportions (%)") +
    labs(title="95% confidence intervals") 

for (i in 1:(nrow(diff.mean) - 1)) 
    p2 <- p2 + ggplot2::annotate('rect', xmin = i+0.5, xmax = i+1.5, ymin = -Inf, ymax = Inf, 
                        fill = ifelse(i %% 2 == 0, 'white', 'gray95'))

p2 <- p2 +
    geom_errorbar(ggplot2::aes(ymin = conf.low, ymax = conf.high), 
                  position = position_dodge(0.8), width = 0.5, size = 0.5) +
    geom_point(shape = 21,size = 3) +
    scale_fill_manual(values=cbbPalette) +
    geom_hline(ggplot2::aes(yintercept = 0), linetype = 'dashed', color = 'black')
    
    
# Middle text column listing the corrected p-value for each pathway
p3 <- ggplot(diff.mean,ggplot2::aes(var,estimate,fill = Group)) +
    geom_text(ggplot2::aes(y = 0,x = var),label = diff.mean$p.value,
              hjust = 0,fontface = "bold",inherit.aes = FALSE,size = 3) +
    geom_text(ggplot2::aes(x = nrow(diff.mean)/2 +0.5,y = 0.85),label = "P-value (corrected)",
             srt = 90,fontface = "bold",size = 5) +
    coord_flip() +
    ylim(c(0,1)) +
    theme(panel.background = element_blank(),
          panel.grid = element_blank(),
          axis.line = element_blank(),
          axis.ticks = element_blank(),
          axis.text = element_blank(),
          axis.title = element_blank())

## Patchwork: bars | CI plot | p-value column, 4:6:2 width ratio
library(patchwork)
p <- p1 + p2 + p3 + plot_layout(widths = c(4,6,2))

## Save plots
ggplot2::ggsave("results/Difference_analysis/STAMP/stamp01.pdf",p,width = 10,height = 4)

Heatmap + Bubble diagram (热图+气泡图)

# Heatmap and Bubble diagram
library(dplyr)
library(ggalluvial)
library(tidyverse)
library(tidyr)
library(reshape2)
library(ggpubr)

# sum of Phylum: collapse the taxon table to phylum-level counts
df3_p <- read.table(file = "data/sum_p.txt", sep = "\t", header = T, check.names = FALSE)
design <- read.table(file = "data/group.txt", sep = "\t", header = T, row.names=1)
data_p<-aggregate(.~ Phylum,data=df3_p,sum)
rownames(data_p) = data_p$Phylum
data_p = data_p[, -1]# 17
#write.csv(data_p, "results/779samples_phylum.csv")

# Wilcoxon test on phylum relative abundances
# Keep only samples present in both the design table and the abundance table
idx = rownames(design) %in% colnames(data_p)
metadata = design[idx, , drop = F]
data_p = data_p[, rownames(metadata)]
# Percent relative abundance per sample.
# `na.rm = TRUE` is now spelled out — the original `na = T` only worked via
# partial argument matching of colSums(x, na.rm, dims).
norm = t(t(data_p)/colSums(data_p, na.rm = TRUE) * 100)
# Keep phyla with mean relative abundance above 0.01%
idx = rowMeans(norm) > 0.01
norm = norm[idx, ]
colSums(norm)
 Healthy01  Healthy02  Healthy03  Healthy04  Healthy05  Healthy06  Healthy07 
  99.94874   99.97143  100.00000   99.99840  100.00000  100.00000  100.00000 
 Healthy08  Healthy09  Healthy10 Patients01 Patients02 Patients03 Patients04 
  99.97784   99.97669  100.00000   99.99816  100.00000  100.00000   99.98066 
Patients05 Patients06 Patients07 Patients08 Patients09 Patients10 
  99.98343  100.00000   99.96991   99.99124  100.00000   99.98941 
data_p = data_p[idx, ]
# Split the samples into the two compared groups
compare_pair = "Healthy-Patients"
group_list = strsplit(compare_pair, "-")[[1]]
metadata$group = metadata$Group
idx = metadata$group %in% group_list
sub_metadata = metadata[idx, , drop = F]
sub_dat = as.matrix(data_p[, rownames(sub_metadata)])
  
method = "wilcox"
idx = sub_metadata$group %in% group_list[1]
GroupA = norm[, rownames(sub_metadata[idx, , drop = F])]
idx = sub_metadata$group %in% group_list[2]
GroupB = norm[, rownames(sub_metadata[idx, , drop = F])]
# Per-phylum statistics: log2 fold change (A/B), log2 max abundance, Wilcoxon p
nrDAF = data.frame(list = rownames(norm), row.names = rownames(norm))
for (i in seq_len(nrow(nrDAF))) {
    # Pseudo-count 1e-07 avoids division by zero for phyla absent in a group
    FC = (mean(GroupA[i, ]) + 1e-07)/(mean(GroupB[i, ]) + 1e-07)
    nrDAF[i, 2] = log2(FC)
    nrDAF[i, 3] = log2(max(c(GroupA[i, ], GroupB[i, ])) * 10000)
    # Warnings suppressed: wilcox.test warns about ties in relative abundances
    nrDAF[i, 4] = suppressWarnings(wilcox.test(as.numeric(GroupA[i, ]),
        as.numeric(GroupB[i, ]))$p.value)
}
# Drop the placeholder first column, name the statistics, and FDR-adjust
nrDAF = nrDAF[, -1]
colnames(nrDAF) = c("logFC", "logCPM", "PValue")
nrDAF$FDR = p.adjust(nrDAF$PValue, method = "fdr", n = nrow(nrDAF))

# Join the per-sample abundances with the test statistics by phylum name
nrDAF$Phylum <- rownames(nrDAF)
sub_dat2 <- as.data.frame(sub_dat)
sub_dat2$Phylum <- rownames(sub_dat2)

dt <- left_join(sub_dat2, nrDAF, by="Phylum")
dt <- as.data.frame(dt)
rownames(dt) <- dt$Phylum
# Columns 1:20 = the 20 samples, 25 = logFC, 22 = FDR (positions in dt)
dt2 <- dt[, c(1:20, 25, 22)]
# Select data for heatmap (sample columns only)
df <- dt[,1:20]
head(df)
                            Healthy01 Healthy02 Healthy03 Healthy04 Healthy05
Actinobacteria                0.94162   0.09249   0.39559   1.75714   0.32597
Bacteroidetes                63.76353  79.16039  10.42674  49.23169  60.61995
Candidatus_Saccharibacteria   0.00000   0.00000   0.00000   0.00000   0.00000
Euryarchaeota                 0.00000   0.26306   0.00000   0.00000   0.01295
Firmicutes                   28.16631  14.50066  74.95015  38.95455  35.54828
Fusobacteria                  0.00000   0.10843   2.36739   0.31074   0.00000
                            Healthy06 Healthy07 Healthy08 Healthy09 Healthy10
Actinobacteria                0.00187   0.72349   0.28244   0.30560   0.03585
Bacteroidetes                47.94422  17.45493  68.45333  72.51388  35.58806
Candidatus_Saccharibacteria   0.00000   0.00209   0.00000   0.00000   0.00000
Euryarchaeota                 0.00000   0.00000   0.08505   0.00000   0.00000
Firmicutes                   46.17143  75.18337  25.88913  21.80918  46.70028
Fusobacteria                  1.21316   0.00000   0.03197   2.80900   0.00000
                            Patients01 Patients02 Patients03 Patients04
Actinobacteria                 1.30915    0.99638    9.03602    0.59828
Bacteroidetes                 46.64941   29.23569    7.39858   65.51137
Candidatus_Saccharibacteria    0.00000    0.00790    0.00770    0.00000
Euryarchaeota                  0.00000    0.00000    0.00000    0.00937
Firmicutes                    47.22529   43.99165   80.61333   28.36682
Fusobacteria                   0.00000    0.24494    0.00000    0.00000
                            Patients05 Patients06 Patients07 Patients08
Actinobacteria                 0.72214   11.46197    0.64551    5.04221
Bacteroidetes                 36.02456   24.39057   58.35594   20.48283
Candidatus_Saccharibacteria    0.00000    0.01242    0.00000    0.20987
Euryarchaeota                  0.00000    0.01093    0.02543    0.00000
Firmicutes                    51.26345   51.40997   30.85718   64.31857
Fusobacteria                   0.00000    0.00000    0.35127    0.00000
                            Patients09 Patients10
Actinobacteria                 2.30729    3.56827
Bacteroidetes                  3.65587    3.28257
Candidatus_Saccharibacteria    0.19536    0.00000
Euryarchaeota                  0.00000    0.00000
Firmicutes                    32.14359   69.28669
Fusobacteria                   0.00000    0.00000
# Convert counts to per-sample proportions (each column sums to 1)
df = sweep(df, 2, colSums(df), "/")
# log10 transform with a small pseudo-count so zeros stay finite
df = log10(df + 1e-05)
# Row-wise z-score: scale() standardizes columns, so transpose around it
df = t(scale(t(df)))
df = as.data.frame(df)

library(pheatmap)
library(ggplot2)
# Set color: blue -> white -> red gradient over 100 steps
mycol <- colorRampPalette(c("#0da9ce", "white", "#e74a32"))(100)
# Plot heatmap; values were already z-scored above, so scale = 'none',
# and clustering is disabled to keep the input row/column order
p1 <- pheatmap(
  df,
  scale = 'none', 
  cluster_rows = F,
  cluster_cols = F,
  show_colnames = F,
  color = mycol
)

# Add group annotation: first 10 columns are Healthy, last 10 Patients
group <- data.frame(type = c(rep("Healthy",10), rep("Patients",10)))
rownames(group) <- colnames(df)
group_colors <- list(type = c(Healthy = "#698e31", Patients = "#9cbe3f"))
head(group)
             type
Healthy01 Healthy
Healthy02 Healthy
Healthy03 Healthy
Healthy04 Healthy
Healthy05 Healthy
Healthy06 Healthy
group_colors
$type
  Healthy  Patients 
"#698e31" "#9cbe3f" 
# Plot as group (ggheatmap returns a ggplot-compatible object, needed for
# the aplot composition further below)
library("ggheatmap")
p2 <- ggheatmap(df,cluster_rows = F,cluster_cols = F,scale = "none",
                color = colorRampPalette(c("#2fa1dd", "white", "#f87669"))(100),
                annotation_cols = group,
                annotation_color = group_colors,
                text_show_cols = NULL)
p2

# Difference bubble chart drawing
dt2$phylum <- rownames(dt2) # add symbol
dt2$x <- c(" ") # Add a constant column to serve as the single x-axis position
# Set order (reversed so the top heatmap row matches the top bubble)
dt2$phylum <- factor(dt2$phylum,levels = rev(unique(dt2$phylum)))
# Set theme
mytheme <- ggplot2::theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(size = 12, angle = -45),
        axis.text.y = element_text(size = 10),
        legend.text = element_text(size = 10),
        legend.text.align = 0.5,
        legend.title = element_text(size = 12, hjust = 0))
# Plot: bubble size = |logFC| via scale, fill intensity = -log10(FDR)
p3 <- ggplot(dt2, ggplot2::aes(x = x, y = phylum)) +
  geom_point(ggplot2::aes(size = logFC,
                 fill = -log10(FDR)),
             color = "black",
             shape = 21, 
             stroke = 0.8) +
  scale_size(range = c(3, 10)) +
  scale_fill_gradient(low = "#f5c6c6", high = "#cf0000") +
  mytheme+
  theme(axis.text.y = element_blank(),
        plot.background = element_blank(),
        text = element_text(size = 14))
p3

library(aplot)
# Attach the bubble column to the right of the heatmap (25% of the width)
p4 <- p2%>%insert_right(p3,width = 0.25)
p4

# Write the combined figure to PDF
pdf("results/Difference_analysis/heatmap_bubble/heatmap_bubble_plot01.pdf", height = 7, width = 7)
p4
dev.off()
png 
  2 

Bar plot + Box plot + Bubble heatmap (热图+柱状图+箱线图)

# Comparison of bubble heat map combined with bar chart and box plot
# Load required packages
library(ggplot2)
library(dplyr)
library(scales)
library(RColorBrewer)
library(ggpubr)
library(grid)
library(paletteer)
library(cowplot)
library(magrittr)
library(stringr)
library(egg)

# Load data: gene counts per species, per-gene statistics, and GSEA results
dat_gene_num <- read.csv("data/fig5a_1.csv", header = TRUE)
re_gene_com <- read.csv("data/fig5a_2.csv", header = TRUE)
re_gsea <- read.csv("data/fig5a_3.csv", header = TRUE)

# Process data
# `mutate` is a dplyr verb; the original qualified it as `ggpubr::mutate`,
# which fails unless ggpubr happens to re-export it — use dplyr:: explicitly.
# Species order follows the companion table so all panels align.
dat_gene_num <- dat_gene_num %>%
  dplyr::mutate(bug_name = factor(bug_name, levels = unique(re_gene_com$species_update)))

re_gsea <- re_gsea %>%
  dplyr::mutate(
    go_id_description = factor(go_id_description, levels = unique(go_id_description)),
    species_update = factor(species_update, levels = rev(unique(species_update))),
    category = factor(category, levels = unique(category)),
    # NOTE(review): '*' marks marginal (0.05 <= padj < 0.1) and '#' marks
    # significant (padj < 0.05) — unusual symbol choice; confirm the legend.
    padj_annot1 = case_when(padj >= 0.05 & padj < 0.1 ~ "*", TRUE ~ ""),
    padj_annot2 = case_when(padj < 0.05 ~ "#", TRUE ~ "")
  )

# Define color palettes
# Symmetric red-blue gradient for enrichment scores in [-1, 1]
breaklist <- seq(-1, 1, by = 0.001)
red_blue <- rev(brewer.pal(n = 11, name = "RdBu"))
col_red_blue <- colorRampPalette(red_blue)(length(breaklist))

# Named category palette: 15 IGV colors plus two greys for Other/Unknown
col_panel <- c(paletteer::paletteer_d("ggsci::default_igv")[1:15], "grey70", "grey90")
names(col_panel) <- c(
  "Amino acid metabolism", "Bacterial structural components", "Cell motility",
  "DNA replication & transcription", "Fatty acid metabolism", "Genetic Rearrangement",
  "Glucose metabolism", "Phage and HGT", "Proteolysis", "Quorum sensing",
  "DNA methylation", "Signal transduction", "Stress response",
  "Virulence and antibiotic resistance", "Damaged DNA repair", "Other", "Unknown"
)

# Bar panel: log10 gene counts per species, colored by gene class.
p_gene_num <- ggplot(dat_gene_num) +
  geom_bar(
    aes(x = log10(gene_num), y = bug_name, fill = gene_ind),
    stat = "identity", position = "dodge", width = 0.7
  ) +
  scale_fill_manual(values = c("#1F4690", "#B73E3E")) +
  labs(x = "T2D-associated genes") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.margin = unit(c(5.5, 1, 5.5, 0), "points"),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 14, face = "italic", color = "black")
  )

# Box panel: absolute t statistics per species, colored by direction.
p_box <- ggplot(re_gene_com) +
  geom_boxplot(
    aes(y = species_update, x = abs(t_statistic), color = Direction),
    position = position_dodge(0.7), alpha = 0.3, width = 0.7
  ) +
  scale_color_manual(values = c("#1F4690", "#B73E3E")) +
  labs(x = "Absolute t statistic") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.margin = unit(c(5.5, 1, 5.5, 1), "points"),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 8)
  )

# Bubble panel: enrichment score (color) and gene count (size) per
# GO term x species cell.
p_bub <- ggplot(re_gsea, aes(x = go_id_description, y = species_update,
                             color = ES, size = genenum_cat)) +
  geom_point(alpha = 0.7) +
  # Significance marks: "*" nudged below the bubble, "#" centered on it.
  geom_text(aes(label = padj_annot1), color = "black", size = 7, nudge_y = -0.2) +
  geom_text(aes(label = padj_annot2), color = "black", size = 4, nudge_y = 0) +
  scale_size(range = c(8, 13), breaks = c(1, 2, 3), name = "Num. of Genes") +
  labs(x = "Species", y = "GO term") +
  scale_color_gradientn(colours = col_red_blue, name = "Enrichment score") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.margin = unit(c(5.5, 1, 5.5, 1), "points"),
    axis.text.y = element_blank(),
    axis.text.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title = element_text(size = 13)
  )

# Legend-only version of the bubble plot; the legend is extracted and saved
# to its own PDF so the main panel can stay legend-free.
p_bub_lgd <- ggplot(re_gsea, aes(x = go_id_description, y = species_update,
                                 color = ES, size = genenum_cat)) +
  geom_point(alpha = 0.8) +
  scale_size(range = c(8, 13), breaks = c(1, 2, 3), name = "Num. of Genes") +
  geom_text(aes(label = padj_annot1), color = "black", size = 7, nudge_y = -0.2) +
  geom_text(aes(label = padj_annot2), color = "black", size = 4, nudge_y = 0) +
  labs(x = "Species", y = "GO term") +
  scale_color_gradientn(colours = col_red_blue, name = "Enrichment score") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title = element_text(size = 13),
    axis.text.x = element_text(size = 13, angle = 45, face = "italic", hjust = 1, vjust = 1)
  )
p_bub_legend <- ggpubr::get_legend(p_bub_lgd)

# Write just the extracted legend grob.
pdf("results/Difference_analysis/heatmap_bar_box/legend_bubble_plot.pdf", width = 8, height = 5)
grid::grid.draw(p_bub_legend)
dev.off()
png 
  2 
# Category strip below the bubble plot ---------------------------------------
# Truncate GO descriptions of 35+ characters: keep the first 30 and the last
# 15 characters joined by "...".
breaks_go <- as.character(unique(re_gsea$go_id_description))
labels_go <- ifelse(str_length(breaks_go) < 35, breaks_go,
                    paste0(str_sub(breaks_go, 1, 30), "...", str_sub(breaks_go, -15, -1)))

p_cat <- ggplot(re_gsea, aes(y = 1, x = go_id_description)) +
  geom_tile(aes(fill = category), width = 1) +
  labs(fill = "Category") +
  scale_fill_manual(values = col_panel) +
  coord_cartesian(expand = FALSE) +
  scale_x_discrete(breaks = breaks_go, labels = labels_go) +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.margin = unit(c(0, 5.5, 5.5, 5.5), "points"),
    axis.title = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(size = 11, angle = 30, vjust = 1, hjust = 1, color = "black")
  )

# Legend-only version of the category strip, written to its own PDF.
p_cat_lgd <- ggplot(re_gsea, aes(x = 1, y = go_id_description)) +
  geom_tile(aes(fill = category), width = 1) +
  labs(fill = "Category") +
  scale_fill_manual(values = col_panel) +
  guides(fill = guide_legend(reverse = FALSE)) +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.title.x = element_blank(),
    axis.title = element_text(size = 13),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.y = element_text(size = 12, color = "black")
  )
cat_legend <- cowplot::get_legend(p_cat_lgd)

pdf("results/Difference_analysis/heatmap_bar_box/go_category_legend.pdf", width = 5, height = 5)
grid::grid.draw(cat_legend)
dev.off()
png 
  2 
# Assemble the final figure: bar + box + bubble on top, category strip below.
blank <- ggplot() + theme_void()
pdf("results/Difference_analysis/heatmap_bar_box/heatmap_bar_box01.pdf", width = 15.5, height = 8)
egg::ggarrange(
  p_gene_num, p_box, p_bub,
  blank, blank, p_cat,
  nrow = 2, ncol = 3,
  heights = c(5, 0.1),
  widths = c(1, 1, 6)
)
dev.off()
png 
  2 

Two-group difference analysis volcano plot(两组差异分析火山图)

#devtools::install_github("BioSenior/ggvolcano", force = TRUE)
library(ggVolcano)

# load data
# The data used here is the result of the previous DESeq2 analysis.
data_vol <- read.table("data/data_volcano.txt", header = TRUE, sep = "\t", row.names = 1)
data_vol <- as.data.frame(data_vol)
# add_regulate() appends a "regulate" column based on |log2FC| >= 1 and
# padj < 0.05 thresholds.
data <- add_regulate(data_vol, log2FC_name = "log2FC",
                     fdr_name = "padj", log2FC = 1, fdr = 0.05)
#data <- data[, -6]
# Overwrite the regulate labels with the precomputed association column
# (Enriched/Depleted) carried in the input table.
data$regulate <- data_vol$association
colnames(data)[3] <- "FoldChange"
data$padj2 <- -log10(as.numeric(data$padj))

# Base volcano: points colored by association, with |log2FC| = 0.5 and
# p = 0.05 guide lines. The redundant ylab("-log10(Pvalue)") call was removed:
# labs(y = ...) below overrode it anyway.
p_volcano1 <- ggplot(data = data, ggplot2::aes(x = FoldChange, y = padj2)) +
  geom_point(alpha = 0.4, size = 3.0, ggplot2::aes(color = regulate)) +
  scale_color_manual(values = c("#74add1", "#a60026", "grey")) +
  # Alternative palettes kept for reference:
  #scale_color_manual(values = c("#4177a4","#b33e5b", "grey")) +
  #scale_color_manual(values = c("#81CBAB","#854888", "grey")) +
  geom_vline(xintercept = c(-0.5, 0.5), lty = 4, col = "black", lwd = 0.4) +
  geom_hline(yintercept = -log10(0.05), lty = 4, col = "black", lwd = 0.4) +
  labs(x = bquote(Log[2]~italic(FC)), y = bquote(atop(-Log[10]~italic(FDR)))) +
  ggplot2::theme_bw()

# add labels
library(dplyr)
# Select the top 5 enriched species by -log10(padj).
up_data1 <- filter(data, data$regulate == "Enriched")
up_data2 <- dplyr::arrange(up_data1, dplyr::desc(up_data1$padj2))
up_data_5 <- up_data2[1:5, ]

# Select the top 5 depleted species (the original comment said "top 25" but
# only 5 rows were ever taken; name fixed to match).
down_data1 <- filter(data, data$regulate == "Depleted")
down_data2 <- dplyr::arrange(down_data1, desc(down_data1$padj2))
down_data_5 <- down_data2[1:5, ]

# Add labels with geom_text_repel(). FIX: map the column by name inside aes()
# (label = row) rather than `df$row` — the `$` form bypasses the layer's data
# and breaks if ggplot ever subsets/reorders rows.
library(ggrepel)
p_volcano2 <- p_volcano1 +
  geom_text_repel(data = up_data_5,
                  ggplot2::aes(x = FoldChange, y = padj2, label = row),
                  size = 3) +
  geom_text_repel(data = down_data_5,
                  ggplot2::aes(x = FoldChange, y = padj2, label = row),
                  size = 3) +
  theme(legend.position = c(0.84, 0.85), panel.grid = element_blank())
ggplot2::ggsave(paste("results//Difference_analysis/volcano_plot/Two_group_volcano_plot", ".pdf", sep=""), 
       p_volcano2, width=100 * 1.5, height=80 * 1.5, unit='mm')
p_volcano2

Multi-group difference analysis volcano plot(多组差异分析火山图)

# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
design <- read.delim(file = "data/metadata2.txt", sep = '\t', stringsAsFactors = FALSE)
rownames(design) <- design$SampleID

# Phylum level: keep the Phylum column plus the 18 sample-count columns.
otutab_p <- otutab2[, c(2, 8:25)]

# Sum counts per phylum.
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]# 1963 species

# KO vs OE: first 12 samples.
data_p01 <- otutab_p2[, 1:12]
design01 <- design[1:12, ]

# Wilcox test
# NOTE: this reassigns `metadata`, shadowing the table loaded at the top of
# the script.
idx = rownames(design01) %in% colnames(data_p01)
metadata = design01[idx, , drop = F]
data_p01 = data_p01[, rownames(metadata)]
# Convert to relative abundance (%). FIX: spell out na.rm = TRUE instead of
# relying on partial argument matching via `na = T`.
norm = t(t(data_p01)/colSums(data_p01, na.rm = TRUE) * 100)
# Keep phyla with mean relative abundance > 0.01%.
idx = rowMeans(norm) > 0.01
norm = norm[idx, ]
colSums(norm)
      KO1       KO2       KO3       KO4       KO5       KO6       OE1       OE2 
 99.99695  99.99161 100.00000 100.00000 100.00000 100.00000  99.99062  99.99084 
      OE3       OE4       OE5       OE6 
 99.99707 100.00000  99.99696  99.98102 
# Subset the raw counts to the phyla retained by the abundance filter.
data_p01 = data_p01[idx, ]
compare_pair = "KO-OE"
group_list = strsplit(compare_pair, "-")[[1]]
metadata$group = metadata$Group
idx = metadata$group %in% group_list
sub_metadata = metadata[idx, , drop = F]
sub_dat = as.matrix(data_p01[, rownames(sub_metadata)])
  
# Per-phylum Wilcoxon rank-sum test between the two groups, run on the
# relative-abundance matrix `norm`.
method = "wilcox"
idx = sub_metadata$group %in% group_list[1]
GroupA = norm[, rownames(sub_metadata[idx, , drop = F])]
idx = sub_metadata$group %in% group_list[2]
GroupB = norm[, rownames(sub_metadata[idx, , drop = F])]
nrDAF = data.frame(list = rownames(norm), row.names = rownames(norm))
# Column 2 = log2 fold change (1e-07 pseudocount avoids division by zero),
# column 3 = log2 of max relative abundance * 1e4 (logCPM-like proxy),
# column 4 = Wilcoxon p-value (suppressWarnings hides tie warnings).
for (i in 1:dim(nrDAF)[1]) {
     FC = (mean(GroupA[i, ]) + 1e-07)/(mean(GroupB[i, 
         ]) + 1e-07)
       nrDAF[i, 2] = log2(FC)
       nrDAF[i, 3] = log2(max(c(GroupA[i, ], GroupB[i, ])) * 
            10000)
       nrDAF[i, 4] = suppressWarnings(wilcox.test(as.numeric(GroupA[i, 
       ]), as.numeric(GroupB[i, ]))$p.value)
       }
nrDAF = nrDAF[, -1]
colnames(nrDAF) = c("logFC", "logCPM", "PValue")
# Benjamini-Hochberg adjustment across all tested phyla.
nrDAF$FDR = p.adjust(nrDAF$PValue, method = "fdr", dim(nrDAF)[1])

nrDAF$Phylum <- rownames(nrDAF)
nrDAF$compared_group <- "KO vs OE"


# KO vs WT
# KO vs WT: KO samples (columns 1:6) and WT samples (columns 13:18).
data_p02 <- otutab_p2[, c(1:6, 13:18)]
design02 <- design[c(1:6,13:18), ]

# Wilcox test
idx = rownames(design02) %in% colnames(data_p02)
metadata = design02[idx, , drop = F]
data_p02 = data_p02[, rownames(metadata)]
# Relative abundance (%). FIX: na.rm spelled out instead of the partial
# match `na = T`.
norm = t(t(data_p02)/colSums(data_p02, na.rm = TRUE) * 100)
# Keep phyla with mean relative abundance > 0.01%.
idx = rowMeans(norm) > 0.01
norm = norm[idx, ]
colSums(norm)
      KO1       KO2       KO3       KO4       KO5       KO6       WT1       WT2 
 99.99695  99.99161 100.00000 100.00000 100.00000 100.00000  99.99190 100.00000 
      WT3       WT4       WT5       WT6 
 99.99446  99.99175  99.99458  99.99181 
# Subset the raw counts to the retained phyla.
# BUG FIX: the original line subset `data_p01` here (copy-paste residue from
# the KO-vs-OE section); this section operates on data_p02.
data_p02 = data_p02[idx, ]
compare_pair = "KO-WT"
group_list = strsplit(compare_pair, "-")[[1]]
metadata$group = metadata$Group
idx = metadata$group %in% group_list
sub_metadata = metadata[idx, , drop = F]
sub_dat = as.matrix(data_p02[, rownames(sub_metadata)])

# Per-phylum Wilcoxon rank-sum test (KO vs WT) on relative abundances.
method = "wilcox"
idx = sub_metadata$group %in% group_list[1]
GroupA = norm[, rownames(sub_metadata[idx, , drop = F])]
idx = sub_metadata$group %in% group_list[2]
GroupB = norm[, rownames(sub_metadata[idx, , drop = F])]
nrDAF2 = data.frame(list = rownames(norm), row.names = rownames(norm))
for (i in 1:dim(nrDAF2)[1]) {
  # Pseudocount 1e-07 avoids division by zero in the fold change.
  FC = (mean(GroupA[i, ]) + 1e-07)/(mean(GroupB[i, ]) + 1e-07)
  nrDAF2[i, 2] = log2(FC)
  # logCPM-like abundance proxy.
  nrDAF2[i, 3] = log2(max(c(GroupA[i, ], GroupB[i, ])) * 10000)
  # suppressWarnings() hides the exact-p-value warnings caused by ties.
  nrDAF2[i, 4] = suppressWarnings(wilcox.test(as.numeric(GroupA[i, ]),
                                              as.numeric(GroupB[i, ]))$p.value)
}
nrDAF2 = nrDAF2[, -1]
colnames(nrDAF2) = c("logFC", "logCPM", "PValue")
# Benjamini-Hochberg adjustment across all tested phyla.
nrDAF2$FDR = p.adjust(nrDAF2$PValue, method = "fdr", dim(nrDAF2)[1])

nrDAF2$Phylum <- rownames(nrDAF2)
nrDAF2$compared_group <- "KO vs WT"


# OE vs WT
data_p03 <- otutab_p2[, c(7:18)]
design03 <- design[c(7:18), ]

# Wilcox test
idx = rownames(design03) %in% colnames(data_p03)
metadata = design03[idx, , drop = F]
data_p03 = data_p03[, rownames(metadata)]
norm = t(t(data_p03)/colSums(data_p03, na = T) * 100)
idx = rowMeans(norm) > 0.01
norm = norm[idx, ]
colSums(norm)
      OE1       OE2       OE3       OE4       OE5       OE6       WT1       WT2 
 99.99062  99.99084  99.99707 100.00000  99.99696  99.98102  99.99190 100.00000 
      WT3       WT4       WT5       WT6 
 99.99446  99.99175  99.99458  99.99181 
# Subset the raw counts to the phyla retained by the abundance filter.
data_p03 = data_p03[idx, ]
compare_pair = "OE-WT"
group_list = strsplit(compare_pair, "-")[[1]]
metadata$group = metadata$Group
idx = metadata$group %in% group_list
sub_metadata = metadata[idx, , drop = F]
sub_dat = as.matrix(data_p03[, rownames(sub_metadata)])
  
# Per-phylum Wilcoxon rank-sum test (OE vs WT) on relative abundances.
method = "wilcox"
idx = sub_metadata$group %in% group_list[1]
GroupA = norm[, rownames(sub_metadata[idx, , drop = F])]
idx = sub_metadata$group %in% group_list[2]
GroupB = norm[, rownames(sub_metadata[idx, , drop = F])]
nrDAF3 = data.frame(list = rownames(norm), row.names = rownames(norm))
# Column 2 = log2 fold change (1e-07 pseudocount avoids division by zero),
# column 3 = logCPM-like abundance proxy, column 4 = Wilcoxon p-value
# (suppressWarnings hides tie warnings from the exact test).
for (i in 1:dim(nrDAF3)[1]) {
     FC = (mean(GroupA[i, ]) + 1e-07)/(mean(GroupB[i, 
         ]) + 1e-07)
       nrDAF3[i, 2] = log2(FC)
       nrDAF3[i, 3] = log2(max(c(GroupA[i, ], GroupB[i, ])) * 
            10000)
       nrDAF3[i, 4] = suppressWarnings(wilcox.test(as.numeric(GroupA[i, 
       ]), as.numeric(GroupB[i, ]))$p.value)
       }
nrDAF3 = nrDAF3[, -1]
colnames(nrDAF3) = c("logFC", "logCPM", "PValue")
# Benjamini-Hochberg adjustment across all tested phyla.
nrDAF3$FDR = p.adjust(nrDAF3$PValue, method = "fdr", dim(nrDAF3)[1])

nrDAF3$Phylum <- rownames(nrDAF3)
nrDAF3$compared_group <- "OE vs WT"

# Combine nrDAF, nrDAF2 and nrDAF3 into one long table.
dat <- rbind(nrDAF, nrDAF2, nrDAF3)

# Data tidying and preparation
# Reshaping data: classify each phylum as up/down/insig using the FDR and
# |logFC| cutoffs below.
log2Foldchange=0.28
adjp=0.1
# BUG FIX: mutate() is exported by dplyr, not ggpubr.
dat.plot <- dat %>% dplyr::mutate(
  "significance"=case_when(FDR < adjp & logFC>= log2Foldchange  ~ 'up',
                     FDR < adjp &logFC<= -log2Foldchange  ~ 'down',
                     TRUE ~ 'insig'))

# Referring the levels of x axis
dat.plot$compared_group <- factor(dat.plot$compared_group,
                                  levels = c("KO vs OE",
                                             "KO vs WT",
                                             "OE vs WT"))

# Background data for geom_col: per comparison, the max positive and min
# negative logFC span the grey background columns; x.local is the numeric
# x position of each comparison.
top_marker=5
background.dat <- data.frame(
  dat.plot %>% group_by(compared_group) %>% filter(logFC>0) %>% 
    dplyr::summarise("y.localup"=max(logFC)),
  dat.plot %>% group_by(compared_group) %>% filter(logFC<0) %>% 
    dplyr::summarise("y.localdown"=min(logFC)),
  x.local=seq(1:length(unique(dat.plot$compared_group)))
) %>% select(-compared_group.1)
names(background.dat)
[1] "compared_group" "y.localup"      "y.localdown"    "x.local"       
# Map each comparison to its numeric x position and join it onto dat.plot.
x.number <- background.dat %>% select(compared_group,x.local) 
dat.plot <- dat.plot%>% left_join(x.number,by = "compared_group")
names(dat.plot)
[1] "logFC"          "logCPM"         "PValue"         "FDR"           
[5] "Phylum"         "compared_group" "significance"   "x.local"       
# Select the top-up and top-down phyla per comparison for labelling.
# NOTE(review): plyr::arrange returns a plain data frame, which may drop
# dplyr grouping before top_n(); left unchanged to preserve the original
# selection — confirm per-group top_n() is actually intended.
dat.marked.up <- dat.plot %>% filter(significance=="up") %>% 
  group_by(compared_group) %>% plyr::arrange(-logFC) %>% 
  top_n(top_marker,abs(logFC))
dat.marked.down <- dat.plot %>% filter(significance=="down") %>% 
  group_by(compared_group) %>% plyr::arrange(logFC) %>% 
  top_n(top_marker,abs(logFC))
dat.marked <- dat.marked.up %>% bind_rows(dat.marked.down)
# Group-information rows drawn as tiles at y = 0.
# BUG FIX: mutate() is exported by dplyr, not ggpubr.
dat.infor <- background.dat %>% 
  dplyr::mutate("y.infor"=rep(0,length(compared_group)))
names(dat.infor)
[1] "compared_group" "y.localup"      "y.localdown"    "x.local"       
[5] "y.infor"       
# Plot: multi-group volcano with grey background columns, jittered points,
# group tiles at y = 0, and repelled labels for the top taxa.
max_overlaps=10
vol.plot <- ggplot()+
  # Grey background columns spanning each comparison's logFC range.
  geom_col(background.dat,mapping=ggplot2::aes(x.local,y.localup),
           fill="grey50",alpha=0.2,width=0.9,just = 0.5)+
  geom_col(background.dat,mapping=ggplot2::aes(x.local,y.localdown),
           fill="grey50",alpha=0.2,width=0.9,just = 0.5)+
  geom_jitter(dat.plot,mapping=ggplot2::aes(x.local,logFC,
                                   color=significance,
                                   fill=significance),
              size=1.5,width = 0.4,alpha= 0.4)+
  scale_color_manual(values = c("#82677e","#eaebea","#59829e"))+
  # scale_color_manual(values = c("#5390b5","#46a9cb","#eaebea","#ffe8d2","#c97aaa","#d56e5e"))+
  # Central tiles carrying the comparison labels.
  geom_tile(dat.infor,mapping=ggplot2::aes(x.local,y.infor,fill=compared_group,
                                  color = compared_group),
            height=log2Foldchange*1.5,
            #color = color.pals[1:length(unique(dat.plot$compared_group))],
            # NOTE(review): passing factor levels as literal colors here looks
            # suspect — confirm these resolve to valid colors when drawn.
            color = unique(dat.plot$compared_group),
            #fill = color.pals[1:length(unique(dat.plot$compared_group))],
            #fill = unique(dat.plot$compared_group),
            alpha = 0.6,
            width=0.9)+guides(size=guide_legend(title="Count"))+ 
  labs(x=NULL,y="log2 Fold change")+
  geom_text(dat.infor,mapping=ggplot2::aes(x.local,y.infor,label=compared_group))+
  ggrepel::geom_label_repel(dat.marked.up,mapping=ggplot2::aes(x.local,logFC,label=Phylum,color=significance),
                            force = 2,size=2, 
                            max.overlaps = max_overlaps,
                            seed = 233,
                            min.segment.length = 0,
                            force_pull = 2,
                            box.padding = 0.1,
                            segment.linetype = 3, 
                            segment.color = 'black', 
                            segment.alpha = 0.5, 
                            direction = "x", 
                            hjust = 0.5)+
  ggrepel::geom_label_repel(dat.marked.down,mapping=ggplot2::aes(x.local,logFC,label=Phylum,color=significance),
                            force = 2,size=2, 
                            max.overlaps = max_overlaps,
                            seed = 233,
                            min.segment.length = 0,
                            force_pull = 2,
                            box.padding = 0.1,
                            segment.linetype = 3, 
                            segment.color = 'black', 
                            segment.alpha = 0.5, 
                            direction = "x", 
                            hjust = 0.5)+
  # BUG FIX: the caption hard-coded "FDR<0.05" although the cutoff applied
  # above is adjp (= 0.1); build the label from the actual thresholds.
  ggplot2::annotate("text", x=1.5, y=max(background.dat$y.localup)+2, 
           label=paste0("|log2FC|>=",log2Foldchange," & FDR<",adjp))+
  ggplot2::theme_classic()+
  theme(
    legend.spacing.x=unit(0.1,'cm'),
    legend.key.width=unit(0.5,'cm'),
    legend.key.height=unit(0.5,'cm'),
    legend.background=element_blank(),
    legend.box="horizontal",
    legend.position = c(0.15,0.72),legend.justification = c(1,0)
    )
vol.plot

#saving plot as png and pdf
# ggsave() without a plot argument writes the last plot displayed.
ggplot2::ggsave('results/Difference_analysis/volcano_plot/Multi_group_vol01.pdf', width = 9, height = 5)
print(vol.plot)

# NOTE(review): this dev.off() closes the current graphics device, not one
# opened in this chunk — confirm it is intentional.
dev.off() 
pdf 
  3 
## Another way to draw
# Data preparation
dat <- rbind(nrDAF, nrDAF2, nrDAF3)
# Reshaping data: classify each phylum by the FDR and |logFC| cutoffs.
log2Foldchange=0.28
adjp=0.1
# BUG FIX: mutate() is exported by dplyr, not ggpubr.
dat.plot <- dat %>% dplyr::mutate(
  "significance"=case_when(FDR < adjp & logFC>= log2Foldchange  ~ 'up',
                     FDR < adjp &logFC<= -log2Foldchange  ~ 'down',
                     TRUE ~ 'insig'))

# Referring the levels of x axis
dat.plot$compared_group <- factor(dat.plot$compared_group,
                                  levels = c("KO vs OE",
                                             "KO vs WT",
                                             "OE vs WT"))
# Taxon identifier column used for point labels.
dat.plot$ID = row.names(dat.plot)
datv = dat.plot
datv$group <- datv$compared_group
# Select the top 5 (by |logFC|) non-"insig" taxa of each comparison group.
# Rewritten from a grow-by-rbind loop to lapply + do.call(rbind, ...), which
# avoids repeated copying while returning the same stacked data frame.
tm.g <- function(data){
  id = data$group %>% unique()
  picked = lapply(id, function(g) {
    filter(data, group == g, significance != "insig") %>%
      distinct(ID, .keep_all = TRUE) %>%
      top_n(5, abs(logFC))
  })
  do.call(rbind, picked)
}
top <- tm.g(datv)
# First draw the background column, determined by the max and min logFC values.
head(datv)
                                 logFC    logCPM      PValue        FDR
Acidobacteria               -1.3502738 13.384400 0.015151515 0.04329004
Actinobacteria              -0.1621734 18.628042 0.132034632 0.16504329
Armatimonadetes             -1.6364291  9.019031 0.002164502 0.01623377
Bacteroidetes               -0.5878856 16.422834 0.064935065 0.08854782
Candidatus_Saccharibacteria  0.0291015 10.086527 1.000000000 1.00000000
Chlamydiae                  -1.0089896  9.802508 0.025974026 0.04329004
                                                 Phylum compared_group
Acidobacteria                             Acidobacteria       KO vs OE
Actinobacteria                           Actinobacteria       KO vs OE
Armatimonadetes                         Armatimonadetes       KO vs OE
Bacteroidetes                             Bacteroidetes       KO vs OE
Candidatus_Saccharibacteria Candidatus_Saccharibacteria       KO vs OE
Chlamydiae                                   Chlamydiae       KO vs OE
                            significance                          ID    group
Acidobacteria                       down               Acidobacteria KO vs OE
Actinobacteria                     insig              Actinobacteria KO vs OE
Armatimonadetes                     down             Armatimonadetes KO vs OE
Bacteroidetes                       down               Bacteroidetes KO vs OE
Candidatus_Saccharibacteria        insig Candidatus_Saccharibacteria KO vs OE
Chlamydiae                          down                  Chlamydiae KO vs OE
# Per-group logFC extremes define the background column heights.
tem = datv %>% group_by(group) %>% dplyr::summarise(max = max(logFC),min = min(logFC)) %>% as.data.frame()
col1 <- data.frame(x = tem$group, y = tem$max)
col2 <- data.frame(x = tem$group, y = tem$min)
# Draw the grey background columns (positive above, negative below zero).
p1 <- ggplot() +
  geom_col(data = col1, mapping = ggplot2::aes(x = x, y = y),
           fill = "#dcdcdc", alpha = 0.6) +
  geom_col(data = col2, mapping = ggplot2::aes(x = x, y = y),
           fill = "#dcdcdc", alpha = 0.6)
p1

# Overlay the scatter volcano map on the background column
# Inspect the plotting table before layering the points.
head(datv)
                                 logFC    logCPM      PValue        FDR
Acidobacteria               -1.3502738 13.384400 0.015151515 0.04329004
Actinobacteria              -0.1621734 18.628042 0.132034632 0.16504329
Armatimonadetes             -1.6364291  9.019031 0.002164502 0.01623377
Bacteroidetes               -0.5878856 16.422834 0.064935065 0.08854782
Candidatus_Saccharibacteria  0.0291015 10.086527 1.000000000 1.00000000
Chlamydiae                  -1.0089896  9.802508 0.025974026 0.04329004
                                                 Phylum compared_group
Acidobacteria                             Acidobacteria       KO vs OE
Actinobacteria                           Actinobacteria       KO vs OE
Armatimonadetes                         Armatimonadetes       KO vs OE
Bacteroidetes                             Bacteroidetes       KO vs OE
Candidatus_Saccharibacteria Candidatus_Saccharibacteria       KO vs OE
Chlamydiae                                   Chlamydiae       KO vs OE
                            significance                          ID    group
Acidobacteria                       down               Acidobacteria KO vs OE
Actinobacteria                     insig              Actinobacteria KO vs OE
Armatimonadetes                     down             Armatimonadetes KO vs OE
Bacteroidetes                       down               Bacteroidetes KO vs OE
Candidatus_Saccharibacteria        insig Candidatus_Saccharibacteria KO vs OE
Chlamydiae                          down                  Chlamydiae KO vs OE
# Background columns plus jittered points colored by significance class.
p2 <- ggplot() +
  geom_col(data = col1, mapping = ggplot2::aes(x = x, y = y),
           fill = "#dcdcdc", alpha = 0.6) +
  geom_col(data = col2, mapping = ggplot2::aes(x = x, y = y),
           fill = "#dcdcdc", alpha = 0.6) +
  geom_jitter(data = datv,
              ggplot2::aes(x = group, y = logFC, color = significance),
              size = 1, width = 0.4) +
  scale_color_manual(name = NULL,
                     values = c("#4393C3", "grey40", "#FC4E2A")) +
  labs(x = "", y = "log2(FoldChange)")
p2

# Grouped color-block labels for the x axis, one tile per comparison at y = 0.
dfcol <- data.frame(x = tem$group, y = 0, label = tem$group)
dfcol$group <- tem$group

# Load color packages
library(RColorBrewer)
library(MetBrewer)
# BiocManager::install("MetBrewer")
# One "Thomas" palette color per comparison group.
tile_color <- met.brewer("Thomas", length(tem$group))
  
# Inlay the label tiles (with white group text) into the scatter plot.
p3 <- p2 +
  geom_tile(data = dfcol, ggplot2::aes(x = x, y = y),
            height = 0.5, color = "black", fill = tile_color,
            alpha = 0.6, show.legend = F) +
  geom_text(data = dfcol, ggplot2::aes(x = x, y = y, label = group),
            size = 3.5, color = "white") +
  ggplot2::theme_classic()
p3
  
library(ggrepel)
# Repelled, arrowed labels for the top taxa of each group.
p4 <- p3 +
  geom_text_repel(data = top,
                  ggplot2::aes(x = group, y = logFC, label = ID),
                  force = 1.2,
                  arrow = arrow(length = unit(0.008, "npc"),
                                type = "open", ends = "last"))
p4

# Strip the background and polish axis/legend styling.
p5 <- p4 +
  ggplot2::theme_minimal() +
  theme(
    axis.title = element_text(size = 18, color = "black", face = "bold"),
    axis.line.y = element_line(color = "black", size = 1.2),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    panel.grid = element_blank(),
    legend.position = "top",
    legend.direction = "vertical",
    legend.justification = c(1, 0),
    legend.text = element_text(size = 12)
  )
p5

# return(list(p5,p3,datv,top))
# Keep the tiled version (p3) as the exported figure.
p = p3
p

filename = paste("results/Difference_analysis/volcano_plot/Mui.group.volcano.pdf",sep = "")
ggplot2::ggsave(filename, p, width = 12, height = 6, limitsize = FALSE)

3.Biomarker identification (3.生物标志物鉴别)

LEfSe analysis(LEfSe分析)

# The first way: microeco software package
# rm(list=ls())
# Load packages
library(tidyverse)
library(microeco)
library(magrittr)
library(ggplot2)

# Load data: OTU table, sample grouping, and taxonomy (CSV files with
# row names in the first column).
otu <-  read.csv("data/otu.csv", row.names = 1)
group <-  read.csv("data/group.csv", row.names = 1)
tax <-  read.csv("data/tax.csv", row.names = 1)

# Create a microtable object that the microeco package can work with.
dataset <- microtable$new(sample_table = group,
                          otu_table = otu, 
                          tax_table = tax)
# Printing the object summarizes the table dimensions.
dataset
microtable-class object:
sample_table have 80 rows and 2 columns
otu_table have 13262 rows and 80 columns
tax_table have 13296 rows and 7 columns
# LEfSe differential-abundance analysis across groups, with FDR-adjusted
# p-values and no subgrouping.
lefse <- trans_diff$new(
  dataset = dataset,
  method = "lefse",
  group = "Group",
  alpha = 0.05,
  p_adjust_method = "fdr",
  lefse_subgroup = NULL
)
microtable-class object:
sample_table have 80 rows and 2 columns
otu_table have 13262 rows and 80 columns
tax_table have 13262 rows and 7 columns
# Check results
# res_diff holds per-taxon LDA scores and (adjusted) p-values.
head(lefse$res_diff)
                                                            Comparison
k__Bacteria|p__Proteobacteria                        MG - NG - SG - YG
k__Bacteria|p__Acidobacteria                         MG - NG - SG - YG
k__Bacteria|p__Acidobacteria|c__Acidobacteria        MG - NG - SG - YG
k__Bacteria|p__Bacteroidetes                         MG - NG - SG - YG
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria MG - NG - SG - YG
k__Bacteria|p__Chloroflexi                           MG - NG - SG - YG
                                                                                                     Taxa
k__Bacteria|p__Proteobacteria                                               k__Bacteria|p__Proteobacteria
k__Bacteria|p__Acidobacteria                                                 k__Bacteria|p__Acidobacteria
k__Bacteria|p__Acidobacteria|c__Acidobacteria               k__Bacteria|p__Acidobacteria|c__Acidobacteria
k__Bacteria|p__Bacteroidetes                                                 k__Bacteria|p__Bacteroidetes
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria
k__Bacteria|p__Chloroflexi                                                     k__Bacteria|p__Chloroflexi
                                                     Method Group      LDA
k__Bacteria|p__Proteobacteria                         LEfSe    MG 4.933121
k__Bacteria|p__Acidobacteria                          LEfSe    YG 4.799503
k__Bacteria|p__Acidobacteria|c__Acidobacteria         LEfSe    YG 4.761262
k__Bacteria|p__Bacteroidetes                          LEfSe    NG 4.757968
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria  LEfSe    MG 4.689141
k__Bacteria|p__Chloroflexi                            LEfSe    YG 4.506116
                                                          P.unadj        P.adj
k__Bacteria|p__Proteobacteria                        9.808412e-09 1.253563e-07
k__Bacteria|p__Acidobacteria                         2.181291e-07 1.660025e-06
k__Bacteria|p__Acidobacteria|c__Acidobacteria        1.309977e-06 7.461175e-06
k__Bacteria|p__Bacteroidetes                         1.010392e-06 5.995157e-06
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 1.217453e-08 1.543416e-07
k__Bacteria|p__Chloroflexi                           6.575168e-03 1.280813e-02
                                                     Significance
k__Bacteria|p__Proteobacteria                                 ***
k__Bacteria|p__Acidobacteria                                  ***
k__Bacteria|p__Acidobacteria|c__Acidobacteria                 ***
k__Bacteria|p__Bacteroidetes                                  ***
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria          ***
k__Bacteria|p__Chloroflexi                                      *
#write.csv(lefse$res_diff, "res_diff.csv", quote = FALSE, row.names = FALSE)

# Bar plot of the top 30 differential taxa ranked by LDA score (log10)
lefse_bar <- lefse$plot_diff_bar(use_number = 1:30, 
                    width = 0.8, 
                    group_order = c("YG", "MG", "NG"))
lefse_bar

# FIX: `units=` spelled out (the original `unit=` only worked through
# partial argument matching); paste(sep = "") replaced by paste0().
ggplot2::ggsave(paste0("results/Biomarker_identification/LEfSe/microeco_lefse_bar02", ".pdf"), lefse_bar, width=189 * 1.5, height=150 * 1.5, units='mm')

# Cladogram: display the top 200 taxa and top 50 differential features
p_lefse = lefse$plot_diff_cladogram(filter_taxa = 0.0001, 
                          use_taxa_num = 200, 
                          use_feature_num = 50, 
                          clade_label_level = 5, 
                          group_order = c("YG", "MG", "NG"))
ggplot2::ggsave(paste0("results/Biomarker_identification/LEfSe/microeco_cladorgam01", ".pdf"), p_lefse, width=189 * 1.5, height=150 * 1.5, units='mm')

# Change details: restrict clade labels to a curated set of taxa
use_labels <- c("c__Deltaproteobacteria", "c__Actinobacteria", "o__Rhizobiales", "p__Proteobacteria", "p__Bacteroidetes", 
                "o__Micrococcales", "p__Acidobacteria", "p__Verrucomicrobia", "p__Firmicutes", 
                "p__Chloroflexi", "c__Acidobacteria", "c__Gammaproteobacteria", "c__Betaproteobacteria", "c__KD4-96",
                "c__Bacilli", "o__Gemmatimonadales", "f__Gemmatimonadaceae", "o__Bacillales", "o__Rhodobacterales")
# then use parameter select_show_labels to show
p_lefse = lefse$plot_diff_cladogram(use_taxa_num = 200, 
                          use_feature_num = 50, 
                          select_show_labels = use_labels)
ggplot2::ggsave(paste0("results/Biomarker_identification/LEfSe/microeco_cladorgam02", ".pdf"), p_lefse, width=189 * 1.5, height=150 * 1.5, units='mm')


# The second method: the microbiomeMarker package
# Load packages
library(tidyverse)
library(magrittr)

# ASV count table: ASVs in rows, samples in columns
otu <- as.matrix(read.csv("data/microbiomeMarker/otu.csv", header = TRUE, row.names = 1))
head(otu)
      CKA1 CKA2 CKA3 CKB1 CKB2 CKB3 CKC1  CKC2  CKC3 LGA1 LGA2 LGA3 LGB1 LGB2
ASV_1   14   14   17   13   10   19   10     4    12   14   12   13   15   15
ASV_2   12   18    9    0    0    5   13    15    13   10    9    0    7    0
ASV_3    5   10    9   12    6    8    6     0     7    7    9   14    0    9
ASV_4   27    7    0 2160  593 4277 9956 26076 16018   29   26   32   18   26
ASV_5    0    0    0    6    0    0    0     6     0    0    8    0    0    0
ASV_6   76    8   37   42   45   46   17    17    36   47   46   55   41   28
      LGB3 LGC1 LGC2 LGC3 MGA1 MGA2 MGA3 MGB1 MGB2 MGB3 MGC1 MGC2 MGC3 HGA1
ASV_1   14   14   22   15 1902 2016 1818  857 1057 1043 5721 7434 6074 3988
ASV_2   14   18   19   18 3092 3163 2930  475  562  597 7135 8660 6688   59
ASV_3   13    0    8   14 1732 1771  744 1650 1702 1685 1387 2662 1587 3670
ASV_4   14   31   13    0    0    4    0    0   14   39   23    0    0    9
ASV_5    7    0    0    0  831  777  846 1457 1521 1410 1211 1417 1420 1418
ASV_6   40   31   28   14  797  858  828 1739 2246 2635  753 1408  654 1288
      HGA2 HGA3 HGB1 HGB2  HGB3 HGC1 HGC2 HGC3
ASV_1 4654 3879 2739 2880  2882 4301 4583 4977
ASV_2   53   44 8876 9833 10042  183  183  202
ASV_3 1796 1736  645  661  1255 5343 3452 4157
ASV_4    0    0   12   37    14    0    0    6
ASV_5 1696 1467  487  526   471 1939 1885 2001
ASV_6 1571 1172  662  677   736 1515 1524 1976
# env and group info.
env <- read.csv("data/microbiomeMarker/env.csv", header = TRUE, row.names = 1)
# FIX: `mutate` and `select` are dplyr verbs; `ggpubr::mutate` is not an
# exported object of ggpubr and errors. Columns are also referenced directly
# inside mutate() instead of through `env$` (same values, proper data masking).
# Adds a sampleID column and a combined grazing+depth treatment label.
env %>%
  dplyr::mutate(
    sampleID = row.names(.),
    treats = paste0(grazing, depth)) %>%
  dplyr::select(sampleID, treats, everything()) -> env

env %>%
  head()
     sampleID treats grazing depth    TN   TP   TK Ammonia    AP   OM       OC
CKA1     CKA1    CKA      CK     A 1.410 1.11 0.70    1.48 30.40 24.2 14.03779
CKA2     CKA2    CKA      CK     A 1.450 1.15 0.69    1.90 29.30 20.1 11.66248
CKA3     CKA3    CKA      CK     A 1.470 1.16 0.66    2.04 25.94 18.3 10.59294
CKB1     CKB1    CKB      CK     B 1.456 1.08 0.67    1.77 35.40 23.9 13.85225
CKB2     CKB2    CKB      CK     B 1.448 1.28 0.72    1.99 32.38 19.2 11.12560
CKB3     CKB3    CKB      CK     B 1.422 1.10 0.70    2.00 27.64 19.3 11.21479
# ASV taxonomy table (one taxonomic rank per column)
tax <- as.matrix(read.csv("data/microbiomeMarker/taxonomy.csv", header = TRUE, row.names = 1))
head(tax)
      Kingdom       Phylum                Class                   
ASV_1 "d__Bacteria" "p__Proteobacteria"   "c__Gammaproteobacteria"
ASV_2 "d__Bacteria" "p__Proteobacteria"   "c__Gammaproteobacteria"
ASV_3 "d__Bacteria" "p__Actinobacteriota" "c__Acidimicrobiia"     
ASV_4 "d__Bacteria" "p__Bacteroidota"     "c__Bacteroidia"        
ASV_5 "d__Bacteria" "p__Proteobacteria"   "c__Alphaproteobacteria"
ASV_6 "d__Bacteria" "p__Cyanobacteria"    "c__Cyanobacteriia"     
      Order                 Family               
ASV_1 "o__Pseudomonadales"  "f__Nitrincolaceae"  
ASV_2 "o__Enterobacterales" "f__Alteromonadaceae"
ASV_3 "o__Actinomarinales"  "f__Actinomarinaceae"
ASV_4 "o__Bacteroidales"    "f__Marinifilaceae"  
ASV_5 "o__Rhodobacterales"  "f__Rhodobacteraceae"
ASV_6 "o__Synechococcales"  "f__Cyanobiaceae"    
      Genus                           
ASV_1 "g__Marinobacterium"            
ASV_2 "g__Glaciecola"                 
ASV_3 "g__Candidatus_Actinomarina"    
ASV_4 "g__Marinifilaceae_unclassified"
ASV_5 "g__Roseobacter"                
ASV_6 "g__Synechococcus_CC9902"       
      Species                                  
ASV_1 "s__Marinobacterium_unclassified"        
ASV_2 "s__Glaciecola_unclassified"             
ASV_3 "s__Candidatus_Actinomarina_unclassified"
ASV_4 "s__Marinifilaceae_unclassified"         
ASV_5 "s__uncultured_Roseobacter_sp."          
ASV_6 "s__Synechococcus_CC9902_unclassified"   
## Data preparation
library(phyloseq)
# Assemble a phyloseq object from the count matrix, taxonomy matrix and
# sample metadata read above. taxa_are_rows = TRUE because ASVs are the
# rows of `otu`.
physeq <- phyloseq(
  otu_table(otu,taxa_are_rows = TRUE), 
  tax_table(tax),
  sample_data(env)
  )
physeq
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 31182 taxa and 36 samples ]
sample_data() Sample Data:       [ 36 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 31182 taxa by 7 taxonomic ranks ]
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("microbiomeMarker")

# Data standardization: rarefy every sample to an even sequencing depth
library(microbiomeMarker) # provides 7 normalization methods
##?normalize
# Seed makes the random read subsampling of "rarefy" reproducible
set.seed(12345)
data <- microbiomeMarker::normalize(physeq, method = "rarefy")
data
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 25486 taxa and 36 samples ]
sample_data() Sample Data:       [ 36 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 25486 taxa by 7 taxonomic ranks ]
# Data filtering: keep only taxa annotated to the Bacteria domain
data <- phyloseq::subset_taxa(data, Kingdom %in% c("d__Bacteria"))
data
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 23865 taxa and 36 samples ]
sample_data() Sample Data:       [ 36 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 23865 taxa by 7 taxonomic ranks ]
## MicrobiomeMarker for lefse analysis
# lefse analysis
# LEfSe analysis - LDA threshold set to 4
# Print the formal arguments of run_lefse for reference (output follows)
args(run_lefse) # View function parameters
function (ps, group, subgroup = NULL, taxa_rank = "all", transform = c("identity", 
    "log10", "log10p"), norm = "CPM", norm_para = list(), kw_cutoff = 0.05, 
    lda_cutoff = 2, bootstrap_n = 30, bootstrap_fraction = 2/3, 
    wilcoxon_cutoff = 0.05, multigrp_strat = FALSE, strict = c("0", 
        "1", "2"), sample_min = 10, only_same_subgrp = FALSE, 
    curv = FALSE) 
NULL
#?run_lefse
# Total number of distinct taxon names across all ranks; used below as the
# Bonferroni divisor for the Kruskal-Wallis cutoff.
adj <- tax_table(data) %>% apply(.,2,function(x) length(unique(x))) %>% sum
# This function does not provide a difference test p-value correction parameter, so you can correct it yourself.
lefse <- run_lefse(
    data,  
    norm = "CPM",
    group = "grazing",
    multigrp_strat = TRUE,
    wilcoxon_cutoff = 0.001, 
    kw_cutoff = 0.01/adj, # Bonferroni-corrected Kruskal-Wallis threshold
    bootstrap_n = 50, # bootstrap times
    lda_cutoff = 4 
) 

## Extract biomarker identification results
# Taxa without annotated names will be named according to the annotation results of the previous taxa.
lefse %>% marker_table() -> res.diff
# write.csv(res.diff,"lda4_diff.csv",quote = FALSE)
# res.diff = as.data.frame(res.diff)
# write.csv(res.diff,"results/lda4_diff.csv")
dim(res.diff) # number of biomarkers at LDA >= 4 (55 here, see output below)
[1] 55  5
res.diff %>% head()
                                                                                                                                                             feature
marker1                                                                                                                                    d__Bacteria|p__Firmicutes
marker2                                                                                                                      d__Bacteria|p__Firmicutes|c__Clostridia
marker3                                   d__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Hyphomicrobiaceae|g__Hyphomicrobiaceae_unclassified
marker4 d__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Hyphomicrobiaceae|g__Hyphomicrobiaceae_unclassified|s__Hyphomicrobiaceae_unclassified
marker5                                                                                                         d__Bacteria|p__Proteobacteria|c__Alphaproteobacteria
marker6                                                                     d__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Alphaproteobacteria_unclassified
        enrich_group   ef_lda       pvalue         padj
marker1           CK 4.393701 9.966991e-07 9.966991e-07
marker2           CK 4.323842 1.004466e-06 1.004466e-06
marker3           CK 4.045791 1.569234e-06 1.569234e-06
marker4           CK 4.045791 1.569234e-06 1.569234e-06
marker5           HG 5.224315 1.143917e-06 1.143917e-06
marker6           HG 4.314859 1.302684e-06 1.302684e-06
## Visualization of lefse analysis results
# cladogram
cols <- RColorBrewer::brewer.pal(8, "Dark2")
# FIX: pdf() takes width/height as plain numerics in INCHES; grid::unit()
# objects are coerced to their bare value, so unit(16,"cm") silently became
# 16 inches. The intended 16 cm is converted explicitly.
pdf("results/Biomarker_identification/LEfSe/microbiomeMarker_cladogram_lda_marke_only.pdf",
    width = 16 / 2.54,   # 16 cm in inches
    height = 16 / 2.54,  # 16 cm in inches
    family="Times")
plot_cladogram(
  lefse, 
  # The number of colors needs to be consistent with the number of enrich_group classification levels.
  color = cols[seq_along(res.diff$enrich_group %>% unique)],
  # FALSE displays all data in the graph, TRUE indicates that only data with significant differences are displayed.
  only_marker = TRUE,
  branch_size = 0.2, 
  alpha = 0.2,
  node_size_scale = 1, 
  node_size_offset = 1.1,
  # Add the largest taxonomic unit of the branch label, and the remaining taxonomic unit annotations are used as legends. The larger the value, the higher the taxonomic unit represented.
  clade_label_level = 7,
  clade_label_font_size = 2, # clade label size
  annotation_shape = 22,
  annotation_shape_size = 2,
  marker_legend_param = list(
    ncol = 2, 
    direction = "horizontal"
  )
  ) +
  theme(
    plot.margin = margin(0, 0, 0, 0),
    legend.position = "bottom"
    )
dev.off()
png 
  2 
## LDA effect size plot
# Bar plot
args(plot_ef_bar)
function (mm, label_level = 1, max_label_len = 60, markers = NULL) 
NULL
# FIX: pdf() expects width/height in inches; unit(16,"cm") was coerced to
# 16 inches. The intended 16 cm is converted explicitly.
pdf("results/Biomarker_identification/LEfSe/microbiomeMarker_ef_bar.pdf",
    width = 16 / 2.54,
    height = 16 / 2.54,
    family="Times")
# LDA effect-size bar plot; label width covers the longest feature string
plot_ef_bar(lefse,max_label_len = max(str_length(res.diff$feature))) +
  scale_fill_manual(values = cols[1:3])
dev.off()
png 
  2 
## Bubble plot
args(plot_ef_dot)
function (mm, label_level = 1, max_label_len = 60, markers = NULL) 
NULL
# FIX: pdf() expects width/height in inches; unit(16,"cm") was coerced to
# 16 inches. The intended 16 cm is converted explicitly.
pdf("results/Biomarker_identification/LEfSe/microbiomeMarker_ef_dot.pdf",
    width = 16 / 2.54,
    height = 16 / 2.54,
    family="Times")
# LDA effect-size dot (bubble) plot
plot_ef_dot(lefse,
            max_label_len = max(str_length(res.diff$feature)))+
  scale_color_manual(values = cols[1:3])
dev.off()
png 
  2 
## Lollipop: effect-size dots plus segments from the LDA cutoff (4) to each dot
p1 <- plot_ef_dot(lefse,
            max_label_len = max(str_length(res.diff$feature)))+
  scale_color_manual(values = cols[1:3])+
  geom_segment(ggplot2::aes(x= 4,xend = effect_size,y = feature, yend = feature))
  
#p1$data
# FIX: ggsave() sizes are numeric with a `units=` argument; grid::unit()
# objects are not its supported interface. `family` is forwarded to the
# pdf device through `...`.
ggplot2::ggsave("results/Biomarker_identification/LEfSe/microbiomeMarker_ef_Lollipop.pdf",
       p1,
       device = "pdf",
       width = 16,
       height = 16,
       units = "cm",
       family = "Times")

## Taxonomic unit abundance difference map
args(plot_abundance)
function (mm, label_level = 1, max_label_len = 60, markers = NULL, 
    group) 
NULL
# Abundance plot of the marker taxa, grouped by grazing treatment
p2 <- plot_abundance(lefse, 
                     group = "grazing",
                     max_label_len = max(str_length(res.diff$feature)))+
  scale_fill_manual(values = cols[1:4],breaks = c("CK","LG","MG","HG"))

## Change details
p2$data$grazing <- factor(p2$data$grazing, levels = c("CK","LG","MG","HG")) # set factor level
p2$data$abd <- log10(p2$data$abd + 1) # log-transform the plotted abundances in place
p2 <- p2 + labs(x = "log10(Abundance +1)") 
# FIX: ggsave() expects numeric sizes plus `units=`, not grid::unit() objects
ggplot2::ggsave("results/Biomarker_identification/LEfSe/microbiomeMarker_abundance.pdf",
       p2,
       device = "pdf",
       width = 16,
       height = 20,
       units = "cm",
       family = "Times")


## The third method: online LEfSe analysis combined with local plot beautification
library(magrittr)
library(dplyr)
library(ggplot2)
library(ggpubr)

# MetaCyc pathway data (data organization)
otutable <-  read.table("data/metacyc_lefse.txt",header = TRUE,row.names = 1,sep = "\t")
otutable = data.frame(otutable,stringsAsFactors = F)
# Coerce every abundance column to numeric; lapply() drops rownames, so the
# pathway names are re-attached as an explicit column (appended last).
otutable2 = as.data.frame(lapply(otutable, as.numeric))
otutable2$pathways = rownames(otutable)
# Move the pathway column to the front; generalized from the hard-coded
# c(21, 1:20) so it survives a change in sample count.
otutable2 = otutable2[, c(ncol(otutable2), seq_len(ncol(otutable2) - 1))]
metadata <- read.table("data/group_lefse.txt", header=T)
# Drop the original Group column and promote the second column to Group
metadata$Group <- NULL
colnames(metadata)[2] <- 'Group'
rownames(metadata) <- metadata$sample

# Align samples with the metadata: transpose, merge on sample ID, transpose back
RA <- otutable2
rownames(RA) <- RA$pathways
RA1 = RA[,2:ncol(RA)]
RA1=as.data.frame(t(RA1))
RA1$sample = rownames(RA1)
RA1 = merge(metadata,RA1,by='sample')
rownames(RA1) = RA1$sample
RA1 = RA1[,-1]
RA1=as.data.frame(t(RA1))
#write.table(RA1,file = 'results/Biomarker_identification/LEfSe/lefse_metacyc_pathways.txt',quote = FALSE,sep = "\t", row.names = T,col.names = T)

# LEfSe online analysis
# LEfSe analysis was performed on the website https://www.bioincloud.tech/task-meta

# Load the LEfSe results. read.table() already returns a data.frame, so the
# redundant as.data.frame() call from the original has been dropped.
lefse = read.table('data/lefse_selected.txt',header=T,sep='\t',stringsAsFactors = F)
# Keep only features assigned to an enriched group, with LDA score >= 3
lefse = lefse[which(lefse$EnrichedGroups != ''),] 
lefse = lefse[which(lefse$LDA>=3.0),]

# When there are exactly 2 enriched groups, negate the LDA scores of one
# group so the bars diverge from zero in opposite directions.
if(length(unique(lefse$EnrichedGroups))==2){
  lefse[lefse$EnrichedGroups==unique(lefse$EnrichedGroups)[1],4] =
    0 - lefse[lefse$EnrichedGroups==unique(lefse$EnrichedGroups)[1],4]
}

# Sort by LDA when there are 2 enriched groups; otherwise sort within groups.
if(length(unique(lefse$EnrichedGroups))==2){
  lefse=lefse[order(lefse$LDA,decreasing = T),]
} else {
  # NOTE(review): `Region` and `Disease` are not created anywhere in this
  # script and look copied from another dataset — confirm these columns exist
  # in data/lefse_selected.txt before the >2-group branch is ever taken.
  lefse=lefse[order(lefse$Region,lefse$Disease,lefse$LDA,decreasing = F),]
}

lefse$KW_Pvalue = as.numeric(lefse$KW_Pvalue)
lefse$EnrichedGroups = factor(lefse$EnrichedGroups,levels = c('Patients','Healthy'))
lefse$Biomarkernames = factor(lefse$Biomarkernames, levels = lefse$Biomarkernames)

# FIX: `mutate` is a dplyr verb; `ggpubr::mutate` is not an exported object.
# Re-levels EnrichedGroups as an ordered factor with Healthy first.
lefse = lefse %>%
  dplyr::mutate(EnrichedGroups = ordered(EnrichedGroups,
                         levels=c("Healthy","Patients")))
lefse$Biomarkernames = factor(lefse$Biomarkernames,levels = as.character(lefse$Biomarkernames))
# Diverging LDA bar plot (Patients vs Healthy), labels drawn inside the panel
# on the side opposite to each bar.
g_metacyc_lefse <- ggplot(lefse,ggplot2::aes(x = Biomarkernames,y = LDA, fill = EnrichedGroups)) + 
  scale_y_continuous(limits = c(-4.5,4),breaks=seq(-4, 4, 1))+
  geom_bar(stat = 'identity',colour = 'black',width = 0.8,position = position_dodge(0.7))+ 
  xlab('') + ylab('LDA SCORE (log 10)') + coord_flip() + bgcolor("white")+
  ggplot2::theme_bw() + labs(fill = "Group")+
  geom_hline(yintercept = c(-4, -3, -2, -1, 0, 1, 2, 3, 4),
             linetype=2,
             alpha=0.6,
             color='black',
             lwd=0.3)+
  theme(legend.position = "bottom")+
  #scale_fill_manual(values = c("#00C0D8","#FF6060"))+
  #scale_color_manual(values = c("#00C0D8","#FF6060"))+
  scale_fill_manual(values = c("#36aecc","#975896"))+
  scale_color_manual(values = c("#36aecc","#975896"))+
  theme(axis.text.y = element_blank(),axis.ticks = element_blank()
        ) +
  theme(panel.border = element_blank(), panel.grid = element_blank()) +
  # Labels sit just inside the zero line, right-justified for positive bars
  geom_text(ggplot2::aes(y = ifelse(lefse$LDA >0,-0.1,0.1),label=Biomarkernames),fontface=1,size=4,hjust = ifelse(lefse$LDA>0,1,0))
# FIX: `units=` spelled out (the original `unit=` relied on partial argument
# matching); paste(sep = "") replaced by paste0().
ggplot2::ggsave(paste0("results/Biomarker_identification/LEfSe/patients_healthy_metacyc_lefse_3.0", ".pdf"), g_metacyc_lefse, width=149 * 1.5, height=170 * 1.5, units='mm')
g_metacyc_lefse

Machine learning (机器学习)

# Biomarker screening using random forest models
# rm(list=ls())
# Load packages
library(reshape2)
library(ggplot2)
library(ggprism)
library(dplyr)
library(plyr)
library(caret)
library(randomForest)
#install.packages("PRROC")
library(PRROC)
library(ROCR)
library(pROC)
library(yardstick)
library(patchwork)
library(cols4all)
library(openxlsx)
library(tidyverse)

# Resolve name masking between the many attached packages.
# NOTE(review): conflicts_prefer() comes from the `conflicted` package, which
# is not attached above — presumably loaded elsewhere; verify.
conflicts_prefer(ggplot2::theme_classic)
conflicts_prefer(base::setdiff)
conflicts_prefer(ggplot2::theme_bw)

# Loading settings and functions (defines rfcv1 used for cross-validation below)
source("function/randomforest.crossvalidation.R")

# Set theme shared by the random-forest figures
mytheme = theme_classic() + 
  theme(text = element_text(family = "sans", size = 10))+
  theme(#legend.position="none",
    legend.text = element_text(size=8),
    legend.title = element_blank(), 
    panel.background = element_blank(),
    panel.grid = element_blank(),
    axis.text.y = element_text(size=10, colour="black", family = "sans", angle = 0), 
    axis.text.x = element_text(size=10, colour="black", family = "sans", angle = 0, hjust = 0),
    axis.title= element_text(size=12, family = "sans"),
    strip.text.x = element_text(size=10, angle = 0),
    strip.text.y = element_text(size=10, angle = 0),
    panel.border = element_rect(colour = "black"),
    plot.title = element_text(size=10, angle = 0),
    # NOTE(review): `size` for element_rect/element_line is deprecated in
    # ggplot2 >= 3.4 in favour of `linewidth` (still works, with a warning)
    strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.5),
    # legend placed inside the panel; overrides nothing set earlier
    legend.position = c(0.85, 0.65),
    )+
      theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
  theme(axis.line = element_line(size = 0.2, colour = "black"))


# 1.Load data
# metadata 
design <- read.table(file = "data/group_RF.txt", sep = "\t", header = T, row.names=1)
# 60 samples
df_species <- read.table(file = "data/species_data_RF.txt", sep = "\t", header = T, check.names = FALSE)

# Sum counts over duplicate Species names, then move Species into rownames
data_species <- aggregate(.~ Species, data = df_species, sum)
rownames(data_species) = data_species$Species
data_species = data_species[, -1]
# Per-column division by 100 — presumably converting percentages to fractions;
# NOTE(review): data_species_ra is not used again in this section — confirm.
data_species_ra = apply(data_species, 2, function(x) x/100)

# Screen species by prevalence: keep species detected (non-zero) in at least
# 5% of the samples.
# Idiom fix: the original nested for-loop is exactly rowSums(data == 0);
# the hard-coded sample count (60) and helper-column indices (61:63) are
# generalized so the code survives a change in sample number.
zero_counts <- rowSums(data_species == 0)
# Output
zero_count = as.data.frame(zero_counts)
data_species2 = data_species
data_species2$zero_counts = zero_count$zero_counts
data_species2$all_counts = ncol(data_species)   # 60 samples in this dataset
data_species2$sample_percent = round(1-data_species2$zero_counts/data_species2$all_counts, 6)
data_species3 = data_species2 %>% filter(data_species2$sample_percent >= 0.05)
# Drop the three helper columns by name rather than by magic position
data_species3 = data_species3[, !(colnames(data_species3) %in% c("zero_counts", "all_counts", "sample_percent"))]

# Among the prevalent species, convert each sample to relative abundance and
# keep species whose abundance reaches 0.01% in at least 5% of the samples.
# (Hard-coded sample count and helper-column indices generalized, as above.)
data_species3 = apply(data_species3, 2, function(x) x/sum(x))
data_species3 = as.data.frame(data_species3)
count_t_values = apply(data_species3, 1, function(x)sum(x>=0.0001))
count_t_values = as.data.frame(count_t_values)
data_species3$count_t_values = count_t_values$count_t_values
data_species3$all_counts = ncol(data_species3) - 1  # sample count, excluding the column just added
data_species3$t_percent = round(data_species3$count_t_values/data_species3$all_counts, 6)
data_species4 = data_species3 %>% filter(data_species3$t_percent >= 0.05)
data_species4 = data_species4[, !(colnames(data_species4) %in% c("count_t_values", "all_counts", "t_percent"))]

# Log10-transform with a small pseudocount so zeros stay finite
data_species5 = log10(data_species4 + 1e-05)

## 2.Data split
# z-score each species across samples (rows of data_species5). apply() over
# rows returns samples-in-rows, so transpose back to species-in-rows.
data_species6 <- t(apply(data_species5, 1, function(x) (x - mean(x)) / sd(x)))
#write.csv(data6, "results/rf_model_species_used.csv")

# Select the previously deduplicated data for analysis
otutab = data_species6
design2 = design

# Subset to the two groups of interest. (The original `if (TRUE) {...}`
# wrapper was a no-op and has been removed.)
sub_design = subset(design2, Group %in% c("Patients","Control")) 
sub_design$group  = factor(sub_design$Group, levels=c("Patients","Control"))
# Keep only design rows whose sample appears in the abundance table, and
# order the abundance columns to match the design rows.
idx = rownames(sub_design) %in% colnames(otutab)
sub_design = sub_design[idx,]
sub_otutab = otutab[,rownames(sub_design)]

# Create data partition: 70/30 train/test split — 42 of the 60 samples train,
# the remaining 18 test.
otutab_t_species = as.data.frame(t(sub_otutab))
# Set classification info.
otutab_t_species$group = factor(sub_design$Group, levels = c("Patients","Control"))
otutab_t_species = na.omit(otutab_t_species)
row.name = rownames(otutab_t_species)
# FIX: the original `set.seed = 515` merely created a variable named
# `set.seed` — the RNG was never seeded, so the split was not reproducible.
set.seed(515)
sam.row.name = sample(row.name, 42, replace = FALSE)
train_data_species = otutab_t_species[sam.row.name, ]
# Test set = all samples not drawn into the training set
unique_rows_df1 <- setdiff(rownames(otutab_t_species), rownames(train_data_species))
test_data_species <- otutab_t_species[unique_rows_df1, ]
#test_data_species = setdiff(otutab_t_species, train_data_species)

## 3.Model training
# Build the feature matrix X_species (species z-scores + outcome column) and
# the binary outcome factor (Control = 0, Patients = 1) from the training set.
dat1_species <- train_data_species
conf_species <- as.data.frame(dat1_species$group)
rownames(conf_species) <- rownames(dat1_species)
colnames(conf_species) <- "Group"
conf_species$sample <- rownames(conf_species)
conf_species <- as.data.frame(conf_species)
dat2_species <- dat1_species
conf2_species <- conf_species
conf2_species$Group = as.factor(as.character(conf2_species$Group))
outcome_species = conf2_species$Group
outcome_species <- sub("Control","0",outcome_species)
outcome_species <- sub("Patients","1",outcome_species)
outcome_species <-as.factor(outcome_species)
dat_species <- dat2_species
X_species <- as.data.frame(dat_species)
X_species$outcome_species = outcome_species
# FIX: drop the original `group` factor column by name instead of the magic
# index -612, which silently breaks if the feature count changes. The
# outcome_species column remains as the last column, as before.
X_species[["group"]] <- NULL

## 5x10-fold cross-validation to choose the number of predictive species
set.seed(999)
# rfcv1() is defined in function/randomforest.crossvalidation.R (sourced above)
result_species <- replicate(5, rfcv1(X_species[,-ncol(X_species)], 
                                     X_species$outcome_species, 
                                     cv.fold=10,step=0.9), simplify=FALSE)
error.cv <- sapply(result_species, "[[", "error.cv")
# CV error vs number of variables: bold line = mean across replicates
matplot(result_species[[1]]$n.var, cbind(rowMeans(error.cv), error.cv), type="l",
        lwd=c(2, rep(1, ncol(error.cv))), col=1, lty=1, log="x",
        xlab="Number of variables", ylab="CV Error")
error.cv.cbm <- cbind(rowMeans(error.cv), error.cv)
# Candidate model sizes: mean error within one SD of the minimum mean error
cutoff <- min (error.cv.cbm[,1])+sd(error.cv.cbm[,1])
error.cv.cbm[error.cv.cbm[,1] < cutoff,]
          [,1]       [,2]       [,3]       [,4]       [,5]       [,6]
611 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
550 0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
495 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
445 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
401 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
361 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
325 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
292 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
263 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
237 0.02857143 0.04761905 0.02380952 0.02380952 0.02380952 0.02380952
213 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
192 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
173 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
140 0.02857143 0.02380952 0.02380952 0.02380952 0.02380952 0.04761905
126 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
113 0.03333333 0.02380952 0.04761905 0.02380952 0.02380952 0.04761905
102 0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
92  0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
83  0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
74  0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
67  0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
60  0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
54  0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
39  0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
36  0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
26  0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
23  0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
12  0.03333333 0.02380952 0.04761905 0.02380952 0.02380952 0.04761905
11  0.03333333 0.02380952 0.04761905 0.02380952 0.02380952 0.04761905
10  0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
9   0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
8   0.03333333 0.07142857 0.02380952 0.02380952 0.02380952 0.02380952
4   0.03333333 0.02380952 0.07142857 0.02380952 0.02380952 0.02380952
3   0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# Mark the chosen model size on the matplot above
abline(v=6,col="pink",lwd=2)

optimal = 6
error.cv.cbm2 <- as.data.frame(error.cv.cbm)
error.cv.cbm2$num <- rownames(error.cv.cbm2)
n.var = as.numeric(error.cv.cbm2$num)
# FIX: column 1 of error.cv.cbm is the row-mean added by cbind() above; the
# five CV replicates are columns 2:6. The original `[,1:5]` mixed the mean
# column in as "err.1" and silently dropped the fifth replicate.
error.cv = error.cv.cbm2[,2:6]
colnames(error.cv) = paste('err',1:5,sep='.')
err.mean = apply(error.cv,1,mean)
allerr = data.frame(num=n.var,err.mean=err.mean,error.cv)
allerr = as.data.frame(allerr)
write.table(allerr, file = "results/Biomarker_identification/Machine_learning/Species_rfcv_5_10_new.txt", 
            sep = "\t", quote = F, row.names = T, col.names = T)

# Prefer the ggplot2 versions of masked functions for the plotting below
conflicts_prefer(ggplot2::aes)
conflicts_prefer(ggplot2::annotate)
conflicts_prefer(ggplot2::ggsave)
# Re-read the CV error table written above
allerr <- read.table(file = "results/Biomarker_identification/Machine_learning/Species_rfcv_5_10_new.txt", 
                     sep = "\t", header = T, row.names=1)
# Theme for the CV-error figure
mytheme3 = theme_bw() + theme(text = element_text(family = "sans", size = 7))+
  theme(legend.position="none",
    legend.text = element_text(size=14),
    legend.title = element_blank(), 
    panel.background = element_blank(),
    panel.grid = element_blank(),
    axis.text.y = element_text(size=14, colour="black", family = "sans", angle = 0), 
    axis.text.x = element_text(size=14, colour="black", family = "sans", angle = 0, hjust = 0),
    axis.title= element_text(size=14),
    strip.text.x = element_text(size=14, angle = 0),
    strip.text.y = element_text(size=14, angle = 0),
    plot.title = element_text(size=14, angle = 0),
    # NOTE(review): `size` in element_rect/element_line is deprecated in
    # ggplot2 >= 3.4 (use `linewidth`); still functional with a warning
    strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.2))+
      theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
  theme(axis.line = element_line(size = 0.1, colour = "black"))

# CV error vs number of species: grey lines = 5 CV replicates, black = mean.
# Idiom fix: columns are mapped through aes() directly instead of `allerr$...`
# (using df$col inside aes() bypasses ggplot's data masking), and the
# redundant `data = allerr` in every layer is dropped.
p01_species = ggplot(allerr, aes(x = num)) + 
  geom_line(aes(y = err.1), colour = 'grey') +
  geom_line(aes(y = err.2), colour = 'grey') +
  geom_line(aes(y = err.3), colour = 'grey') +
  geom_line(aes(y = err.4), colour = 'grey') +
  geom_line(aes(y = err.5), colour = 'grey') +
  geom_line(aes(y = err.mean), colour = 'black') + 
  geom_vline(xintercept = optimal, colour='black', lwd=0.36, linetype="dashed") + 
  # NOTE(review): hard-coded reference error rate — confirm it matches the data
  geom_hline(yintercept = 0.05976941, colour='black', lwd=0.36, linetype="dashed") +
  mytheme3+
  coord_trans(x = "log2") +
  scale_x_continuous(breaks = c(10, 30, 50, 100, 200, 400)) + # , max(allerr$num)
  labs(#title=paste('Training set (n = ', dim(train_data_species)[1],')', sep = ''),
      x='Number of species ', y='Cross-validation error rate') +
  annotate("text", x = optimal, y = max(allerr$err.mean), label=paste0("optimal = ", optimal)) +
  #main_theme+ 
  theme_bw() + theme(panel.background = element_blank(),
        panel.grid.major =element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none",
        axis.title= element_text(size=10, family = "sans"))
ggsave("results/Biomarker_identification/Machine_learning/Species_rfcv_5_10_top6.pdf",
       p01_species,width = 5,height = 3.2)
p01_species

##### Pick the top 6 markers by cross-validation frequency #####
# (Header comment fixed: the original said "32" but 6 markers are selected.)
# Stack the ranked variable indices from all 5 repeats x 10 folds into one
# matrix with one row per fold — replaces the original index-juggling loops
# and the hard-coded 50x611 matrix dimensions.
b <- do.call(rbind, lapply(result_species, function(r) do.call(rbind, r$res)))
# Top-6 ranked variables of every fold, flattened column-major — the same
# order the original nested loop produced. (`flat_idx` replaces the original
# variable `list`, which shadowed base::list.)
mlg.list<-b[,1:6]
flat_idx <- as.vector(mlg.list)
# Count how often each variable index was selected; most frequent first
mlg.sort<-as.matrix(table(flat_idx))
mlg.sort<-mlg.sort[rev(order(mlg.sort[,1])),]
pick_species<- as.numeric(names(head(mlg.sort,6)))
tmp= X_species[,-ncol( X_species)]
mlg.pick.species<-colnames(tmp)[pick_species]
write.table(mlg.pick.species,"results/Biomarker_identification/Machine_learning/cross_validation_pick_6_in_species.txt",
            sep="\t",quote=F)

## train.set
## Comparison of the probability of disease prediction between the disease group and the healthy control group
# FIX: the outcome column is indexed as ncol(X_species) instead of the magic
# number 612, so the code survives changes in the feature count.
train1_species <- X_species[,c(pick_species, ncol(X_species))]
train1_species <-data.frame(train1_species)
set.seed(32)
train1.rf_species <- randomForest(outcome_species~., data =train1_species,
                          importance = TRUE)
# Out-of-bag class probabilities; column 2 = P(class "1" = Patients)
train1.pre_species <- predict(train1.rf_species,type="prob")
p.train_species <- train1.pre_species[,2]
#boxplot(p.train~outcome,col=c(3,4),main="Probability of Patients")
write.table(p.train_species,"results/Biomarker_identification/Machine_learning/species.cross_validation.6makr.predict.in.train.txt",
            sep="\t",quote=F)

# Recode the 0/1 outcome back to group labels for plotting
train1_pre2_species <- data.frame(outcome_species, p.train_species)
train1_pre2_species$outcome_species <- as.factor(train1_pre2_species$outcome_species)
train1_pre2_species$outcome_species <- sub("0","Healthy",train1_pre2_species$outcome_species)
train1_pre2_species$outcome_species <- sub("1","Patients",train1_pre2_species$outcome_species)
compaired = list(c("Healthy", "Patients"))

library(ggsignif)
library(scales)
# Pairwise comparison for the significance bracket
compaired2 = list(c("Healthy", "Patients"))
# NOTE(review): this read overwrites the train1_pre2_species built just above
# from in-memory predictions — confirm the *_box.txt file matches
# species.cross_validation.6makr.predict.in.train.txt.
train1_pre2_species <- read.table(file = "results/Biomarker_identification/Machine_learning/species.cross_validation.6makr.predict.in.train_box.txt", 
                                  sep = "\t", header = T, row.names=1)
# Train-set boxplot of predicted patient probability:
# dashed-outline box as background, jittered points, solid box redrawn on top.
p02_species <- ggplot(train1_pre2_species, aes(x=outcome_species, y=p.train_species, fill=outcome_species)) + 
  geom_boxplot(position=position_dodge(width =0.4),width=0.5, size = 0.4,
               fill = "transparent", 
               outlier.shape = NA,
               linetype = "dashed")+
  #theme_bw()+ 
  theme_classic()+
  # NOTE(review): `color = outcome_species` passes the whole vector as a
  # legend title; harmless here since the legend is hidden below.
  labs(x = NULL, y = "Probability of Patients", color = outcome_species)+
  geom_jitter(aes(color=outcome_species),position = position_jitter(0.15), 
                size = 0.3, alpha = 1)+
  # Whisker caps drawn as error bars; `..ymax../..ymin..` is the pre-3.4
  # ggplot2 stat notation (after_stat() in newer releases).
  stat_boxplot(geom = "errorbar",aes(ymin=..ymax..),
               width=0.18,color="black",size = 0.4)+
  stat_boxplot(geom = "errorbar",aes(ymax=..ymin..),
               width=0.18,color="black",size = 0.4)+
  # Redraw the interquartile box with a solid black outline over the dashed one
  stat_boxplot(aes(ymin=..lower..,ymax=..upper.., fill=outcome_species), color="black",
               fill = "transparent",position=position_dodge(width =0.4),
               width=0.5, size = 0.4,outlier.shape = NA)+
  # Wilcoxon rank-sum test; raw p-value shown (map_signif_level = F)
  geom_signif(comparisons = compaired2, step_increase = 0.3, map_signif_level = F,
            test = wilcox.test, color = "black", size = 0.2, textsize = 3)+
  scale_y_continuous(labels = label_number(accuracy = 0.1)) +
  scale_fill_manual(values = c("#46a9cb","#945893"))+
  scale_color_manual(values = c("#46a9cb","#945893"))+
  theme(panel.background = element_blank(), panel.grid.major =element_blank(),
        panel.grid.minor = element_blank(), legend.position = "none",
        axis.text = element_text(size=10, family = "sans"),
        axis.title= element_text(size=10, family = "sans"),
        text = element_text(family = "sans", size = 10))
ggsave(paste("results/Biomarker_identification/Machine_learning/Species_6markers_patients_healthy_boxplot2",".pdf", sep=""), 
       p02_species, width=69 * 1.5, height=80 * 1.5, unit='mm')
p02_species

# Mean Decrease Accuracy scores each feature by how much the forest's
# accuracy drops when that feature's values are permuted across the trees —
# a standard random-forest importance measure for feature selection.
varImpPlot(train1.rf_species, main = "Top feature importance", n.var = 6)

# Persist the trained forest's confusion matrix
write.table(train1.rf_species$confusion,
            file = "results/Biomarker_identification/Machine_learning/Species_confusion_rf2.txt",
            sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
# Importance table, rounded and sorted ascending by MeanDecreaseAccuracy
imp_species <- as.data.frame(round(importance(train1.rf_species), 2))
imp_species <- imp_species[order(imp_species$MeanDecreaseAccuracy, decreasing = FALSE), ]
write.table(imp_species,
            file = "results/Biomarker_identification/Machine_learning/Species_imp_rf2.txt",
            sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)

# Phylum
# Join the phylum column ($3) of taxonomy_RF.txt (keyed on species name, $8)
# onto the importance table, then append "Phylum" to the header line.
system("awk 'NR==FNR{a[$8]=$3} NR>FNR{print $0\"\t\"a[$1]}' data/taxonomy_RF.txt results/Biomarker_identification/Machine_learning/Species_imp_rf2.txt | sed '1 s/$/Phylum/' > results/Biomarker_identification/Machine_learning/Species_imp_phylum_rf2.txt")
[1] 2
# Family
# Join the family column ($6) of taxonomy_RF.txt (keyed on species name, $8)
# onto the importance table.
# Fixes: the appended header column is "Family" (it previously said "Phylum",
# breaking the `fill = Family` bar plot below), and the output file name is
# "..._rf2.txt" to match the path the family bar plot reads.
system("awk 'NR==FNR{a[$8]=$6} NR>FNR{print $0\"\t\"a[$1]}' data/taxonomy_RF.txt results/Biomarker_identification/Machine_learning/Species_imp_rf2.txt | sed '1 s/$/Family/' > results/Biomarker_identification/Machine_learning/Species_imp_family_rf2.txt")
[1] 2
# Bar plot (Phylum)
# Importance table with the phylum annotation appended by the awk step above
imp_species = read.table("results/Biomarker_identification/Machine_learning/Species_imp_phylum_rf2.txt", 
                         header=T, row.names= 1, sep="\t")
# Keep the `optimal` most important rows (table is sorted ascending)
imp_species = tail(imp_species, n = optimal)
# Lock the row order into factor levels so ggplot keeps the ranking
imp_species$Species = factor(rownames(imp_species), levels = rownames(imp_species))
# Horizontal bar chart of MeanDecreaseAccuracy, colored by phylum.
# NOTE(review): the manual palette supplies 3 colors — assumes at most 3 phyla.
p03_species = ggplot(imp_species, aes(x = Species, y = MeanDecreaseAccuracy, fill = Phylum)) + 
  geom_bar(stat = "identity") + theme_classic()+
  #scale_fill_manual(values = c("#63B8FF","orange","#4AB3AA", "#D10640"))+
  #  scale_color_manual(values = c("#63B8FF", "orange","#4AB3AA","#D10640"))+
  scale_fill_manual(values = c("#63B8FF","#4AB3AA", "#D10640"))+
    scale_color_manual(values = c("#63B8FF", "#4AB3AA","#D10640"))+
  coord_flip() + #main_theme+
  theme(legend.position = c(0.85,0.8))+
  scale_y_continuous(expand = c(0,0))+
  labs(y = "Mean Decrease Accuracy", x = "Species")
ggsave(paste("results/Biomarker_identification/Machine_learning/Species_top_feautre_top6markers_phylum",
             ".pdf", sep=""), p03_species, width=119 * 1.5, height=80 * 1.5, unit='mm')
p03_species

# Bar plot (Family)
# NOTE(review): make sure the family-annotation awk step above writes this
# exact "_rf2" path with a "Family" header column — the file-name suffixes
# (_rf vs _rf2) and header names have drifted in this script; verify.
imp_species = read.table("results/Biomarker_identification/Machine_learning/Species_imp_family_rf2.txt", 
                         header=T, row.names= 1, sep="\t")
# Keep the `optimal` most important rows (table is sorted ascending)
imp_species = tail(imp_species, n = optimal)
# Lock the row order into factor levels so ggplot keeps the ranking
imp_species$Species = factor(rownames(imp_species), levels = rownames(imp_species))
p04_species = ggplot(imp_species, aes(x = Species, y = MeanDecreaseAccuracy, fill = Family)) + 
  geom_bar(stat = "identity") + theme_classic()+
  coord_flip() + #main_theme+
  scale_fill_manual(values = c("#d2da93","#5196d5","#00ceff","#ff630d","#9b82e1",
                  "#e5acd7","#36999d","#ec8181","#dfc6a5","#e50719",
                  "#d27e43","#8a4984","#fe5094","#8d342e","#f94e54",
                  "#ffad00","#36999d","#00fc8d","#b64aa0","#9b82e1"))+
  scale_y_continuous(expand = c(0,0))+
  labs(y = "Mean Decrease Accuracy", x = "Species")
ggsave(paste("results/Biomarker_identification/Machine_learning/Species_top_feautre_top6markers_family",".pdf", sep=""), 
       p04_species, width=119 * 1.5, height=70 * 1.5, unit='mm')
p04_species

## 4.Train set ROC curve
# ROC in train set
# pROC::roc on the 0/1 outcome vs the out-of-bag patient probability;
# also draws the curve (plot=TRUE) with a bootstrapped CI (boot.n=100).
roc1_species_train <- roc(outcome_species, p.train_species,
            ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,
            plot=TRUE#, percent=roc1$percent,col=2
            )

# Get AUROC mean and confidence interval
auc_species_train = round(roc1_species_train$auc,3)
roc1_species_train2 <- plot.roc(outcome_species, p.train_species, ci=TRUE, print.auc=TRUE)

# ci[1]/ci[3] are the lower/upper bounds of the AUC CI (ci[2] is the estimate)
ci_low_species_train = round(roc1_species_train2$ci[1], 3)
ci_high_species_train = round(roc1_species_train2$ci[3], 3)
# Calculate 95% confidence interval of sensitivity across 25 specificities
roc.list <- list(roc1_species_train)
ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25))
ciobj02 <- ci.se(roc1_species_train, # CI of sensitivity, random forest
               specificities=seq(0, 1, 0.01)) # over a select set of specificities
# Put the ROC confidence-interval bounds into data frames (x = specificity).
# NOTE(review): the lapply() below never uses `ciobj3` — its function
# parameter used to shadow that name; the parameter is renamed `ci_obj`
# to make the independence explicit.
ciobj3 <- as.data.frame(ciobj02)
dat.ci.list <- lapply(ci.list, function(ci_obj)
  data.frame(x = as.numeric(rownames(ci_obj)),
             lower = ci_obj[, 1],
             upper = ci_obj[, 3]))
# Plot the training-set ROC curve with its CI ribbon.
# NOTE(review): coord_fixed() below replaces the coord_equal() added earlier —
# ggplot2 keeps only the last coordinate system. Sample size (n = 42) is
# hard-coded in the title; verify against nrow(train_data_species).
p1_species_train <- ggroc(roc.list, legacy.axes = TRUE) + #theme_minimal() +
  theme_bw()+ coord_equal()+ 
  theme(panel.background = element_blank(),
        panel.grid.major =element_blank(),
        panel.grid.minor = element_blank(),
        #axis.title= element_text(size=10, family = "sans"),
        #plot.title = element_text(size = 10, family = "sans", hjust = 0.5),
        #text = element_text(family = "sans", size = 10),
        legend.position = "none")+ 
  geom_abline(slope=1, intercept = 0, linetype = "dashed", alpha=0.5, color = "grey") + 
  coord_fixed(ratio = 0.9)+ ggtitle("Training set (n = 42)")+
  geom_line(size = 0.8)+labs(x = "1 - Specificity", y = "Sensitivity")+
  annotate("text", x = 0.77, y = 0.18, label = paste0("AUC = ", auc_species_train), size = 3)+
  annotate("text", x = 0.77, y = 0.08, label = paste0("CI = ",  ci_low_species_train, "-", ci_high_species_train), size = 3)+
  scale_color_manual(values=c("#CD3278"))

col.list = list("#CD3278")
# Add confidence-interval ribbon (one curve, hence the 1:1 loop)
for(i in 1:1) {
  p1_species_train <- p1_species_train + geom_ribbon(
    data = dat.ci.list[[i]],
    aes(x = 1-x, ymin = lower, ymax = upper),
    fill = col.list[[i]],
    alpha = 0.3,
    inherit.aes = F)
}
#ggsave(paste("results/Biomarker_identification/Machine_learning/selected_6_species_model_auroc_train_set",".pdf", sep=""), p1_species_train, width=109 * 1.5, height=60 * 1.5, unit='mm')
p1_species_train

## 5.Test set ROC curve
# ROC in test set
#dat3_species_test <- test_data_species[,c(pick_species, 402)]
dat3_species_test <- test_data_species
dat3_species_test <- data.frame(dat3_species_test)

set.seed(32)
# Predicted class probabilities on the held-out test samples
test_species <- predict(train1.rf_species, dat3_species_test, type="prob")
# One-column data frame of true group labels, keyed by sample name
conf3_species_test <- as.data.frame(dat3_species_test$group)
rownames(conf3_species_test) <- rownames(dat3_species_test)
colnames(conf3_species_test) <- "Group"
conf3_species_test$sample <- rownames(conf3_species_test)

# Requires the `conflicted` package; forces base::intersect below
conflicts_prefer(base::intersect)

# Align prediction rows with the label table by sample ID.
# sub() strips a leading "X" that data.frame() name-mangling may have added
# — presumably to numeric-leading sample names; verify against the data.
rN.test <- rownames(test_species)
rN.test <- sub("X","",rN.test)
rN.conf <- rownames(conf3_species_test)
gid <- intersect(rN.test ,rN.conf)
# NOTE(review): pmatch() does partial matching; with unique exact IDs it
# behaves like match(), but match() would be the safer choice — confirm IDs.
test_species <- test_species[pmatch(gid, rN.test), ]
conf3_species_test <- conf3_species_test[pmatch(gid, rN.conf), ]
write.table(test_species[, 2],"results/Biomarker_identification/Machine_learning/species.cross_validation.6makr.predict.in.test.txt",
            sep="\t",quote=F)

# Pairwise comparison for the significance bracket
compaired2 = list(c("Healthy", "Patients"))
# NOTE(review): predictions are re-read from a manually prepared *_box.txt —
# confirm it matches species.cross_validation.6makr.predict.in.test.txt.
test1_pre2_species <- read.table(file = "results/Biomarker_identification/Machine_learning/species.cross_validation.6makr.predict.in.test_box.txt", 
                                 sep = "\t", header = T, row.names=1)
# Test-set boxplot of predicted patient probability; same layered construction
# as the train-set plot: dashed box, jitter, whisker caps, solid box on top.
p02_species_test <- ggplot(test1_pre2_species, aes(x=outcome_species, y=p.test_species, fill=outcome_species)) + 
  geom_boxplot(position=position_dodge(width =0.4),width=0.5, size = 0.4,
               fill = "transparent", 
               outlier.shape = NA,
               linetype = "dashed")+
  #theme_bw()+ 
  theme_classic()+ 
  labs(x = NULL, y = "Probability of Patients", color = outcome_species)+
  geom_jitter(aes(color=outcome_species),position = position_jitter(0.15), 
                size = 0.3, alpha = 1)+
  # `..ymax../..ymin..` is the pre-3.4 ggplot2 stat notation (after_stat() now)
  stat_boxplot(geom = "errorbar",aes(ymin=..ymax..),
               width=0.18,color="black",size = 0.4)+
  stat_boxplot(geom = "errorbar",aes(ymax=..ymin..),
               width=0.18,color="black",size = 0.4)+
  stat_boxplot(aes(ymin=..lower..,ymax=..upper.., fill=outcome_species), color="black",
               fill = "transparent",position=position_dodge(width =0.4),
               width=0.5, size = 0.4,outlier.shape = NA)+
  # Wilcoxon rank-sum test; raw p-value shown (map_signif_level = F)
  geom_signif(comparisons = compaired2, step_increase = 0.3, map_signif_level = F,
            test = wilcox.test, color = "black", size = 0.2, textsize = 3)+
  scale_y_continuous(labels = label_number(accuracy = 0.1)) +
  scale_fill_manual(values = c("#46a9cb","#945893"))+
  scale_color_manual(values = c("#46a9cb","#945893"))+
  theme(panel.background = element_blank(), panel.grid.major =element_blank(),
        panel.grid.minor = element_blank(), legend.position = "none",
        axis.text = element_text(size=10, family = "sans"),
        axis.title= element_text(size=10, family = "sans"),
        text = element_text(family = "sans", size = 10))
ggsave(paste("results/Biomarker_identification/Machine_learning/Species_6markers_patients_healthy_boxplot_test2",
             ".pdf", sep=""), p02_species_test, width=69 * 1.5, height=80 * 1.5, unit='mm')
p02_species_test

########test.ROC##########
# Recode test-set group labels to 0/1 for pROC
outcome_species_test = conf3_species_test$Group
outcome_species_test <- sub("Control","0",outcome_species_test)
outcome_species_test <- sub("Patients", "1", outcome_species_test)
# ROC of the test-set predictions, with bootstrapped CI; also draws the curve
roc1_species_test <- roc(outcome_species_test, test_species[,2],
            ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,
            plot=TRUE#, percent=roc1$percent,col=2
            )

# AUROC
roc_species_test2 = roc(outcome_species_test, test_species[,2])
# roc_species_test2
# Get AUROC mean and confidence interval
auc_species_test = round(roc_species_test2$auc,3)
roc_species_test2_2 <- plot.roc(outcome_species_test, test_species[, 2],
                   ci=TRUE, print.auc=TRUE)

# ci[1]/ci[3] = lower/upper AUC CI bounds.
# NOTE(review): `ci_high_sepcies_test` is a typo for "species" but the
# misspelled name is referenced later in the script — keep it as-is.
ci_low_species_test = round(roc_species_test2_2$ci[1], 3)
ci_high_sepcies_test = round(roc_species_test2_2$ci[3], 3)

# Calculate 95% confidence interval of sensitivity for the test-set ROC
roc.list <- list(roc_species_test2)
ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25))
ciobj02 <- ci.se(roc_species_test2, # CI of sensitivity, random forest
               specificities=seq(0, 1, 0.01)) # over a select set of specificities

# Put the ROC confidence-interval bounds into data frames (x = specificity).
# NOTE(review): the lapply() below never uses `ciobj3` — its function
# parameter used to shadow that name; the parameter is renamed `ci_obj`
# to make the independence explicit.
ciobj3 <- as.data.frame(ciobj02)
dat.ci.list <- lapply(ci.list, function(ci_obj)
  data.frame(x = as.numeric(rownames(ci_obj)),
             lower = ci_obj[, 1],
             upper = ci_obj[, 3]))

# Plot the test-set ROC curve with its CI ribbon.
# NOTE(review): coord_fixed() replaces the coord_equal() before it (ggplot2
# keeps only the last coordinate system); "n = 18" in the title is hard-coded.
p1_species_test <- ggroc(roc.list, legacy.axes = TRUE) + #theme_minimal() +
  theme_bw()+
  theme(panel.background = element_blank(),
        panel.grid.major =element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none") + coord_equal() + coord_fixed(ratio = 0.9)+
  geom_abline(slope=1, intercept = 0, linetype = "dashed", alpha=0.5, color = "grey") + 
  ggtitle("Testing set (n = 18)")+
  geom_line(size = 0.8)+labs(x = "1 - Specificity", y = "Sensitivity")+
  annotate("text", x = 0.77, y = 0.18, label = paste0("AUC = ", auc_species_test), size = 3)+
  annotate("text", x = 0.77, y = 0.08, label = paste0("CI = ",  ci_low_species_test, "-", ci_high_sepcies_test), size = 3)+
  scale_color_manual(values=c("#CD3278"))

col.list = list("#CD3278")
# Add confidence-interval ribbon (one curve, hence the 1:1 loop)
for(i in 1:1) {
  p1_species_test <- p1_species_test + geom_ribbon(
    data = dat.ci.list[[i]],
    aes(x = 1-x, ymin = lower, ymax = upper),
    #fill = i + 1,
    fill = col.list[[i]],
    alpha = 0.3,
    inherit.aes = F)
}
#ggsave(paste("results/Biomarker_identification/Machine_learning/all_selected_6_species_model_auroc_test_set_new",".pdf", sep=""), p1_species_test, width=109 * 1.5, height=60 * 1.5, unit='mm')
p1_species_test

# Combined train + test ROC panels, side by side
p_species_all <- p1_species_train + p1_species_test
ggsave(paste0("results/Biomarker_identification/Machine_learning/all_selected_6_species_model_auroc_both_set_new", ".pdf"),
       p_species_all, width = 109 * 1.5, height = 60 * 1.5, unit = 'mm')
p_species_all

library(patchwork)
# Error curve plus train/test probability boxplots in one row
p_abc <- p01_species + p02_species + p02_species_test
p_abc

ggsave(paste0("results/Biomarker_identification/Machine_learning/p_abc1", ".pdf"),
       p_abc, width = 169 * 1.5, height = 60 * 1.5, unit = 'mm')

# NOTE(review): `|` has lower precedence than `+` in R, so this composes as
# p01_species | (p02_species + p02_species_test) — confirm that layout is intended.
p_abc2 <- p01_species | p02_species + p02_species_test
p_abc2

ggsave(paste0("results/Biomarker_identification/Machine_learning/p_abc2", ".pdf"),
       p_abc2, width = 140 * 1.5, height = 60 * 1.5, unit = 'mm')

## 6.Train and test set ROC curve combination
# Train and test set: CI of sensitivity over 25 specificities per curve.
# (Removed dead code: two extra ci.se() calls whose result fed an
# as.data.frame(ciobj01, ciobj02) call that misused its second argument as
# row.names — dat.ci.list is built from ci.list01 only.)
roc.list01 <- list(roc1_species_train, roc_species_test2)
ci.list01 <- lapply(roc.list01, ci.se, specificities = seq(0, 1, l = 25))
dat.ci.list <- lapply(ci.list01, function(ci_obj)
  data.frame(x = as.numeric(rownames(ci_obj)),
             lower = ci_obj[, 1],
             upper = ci_obj[, 3]))
# Legend labels built from the computed AUC/CI values (previously hard-coded
# numbers that could drift out of sync with the fitted models).
# NB: `ci_high_sepcies_test` spelling matches the variable defined above.
legend_labels <- c(
  paste0("Training set (AUC = ", auc_species_train,
         " (CI = ", ci_low_species_train, "-", ci_high_species_train, "))"),
  paste0("Testing set (AUC = ", auc_species_test,
         " (CI = ", ci_low_species_test, "-", ci_high_sepcies_test, "))")
)
# Plot both ROC curves in one panel with an in-panel legend
p_all_species <- ggroc(roc.list01, legacy.axes = TRUE) + #theme_minimal() +
  theme_bw()+
  theme(panel.background = element_blank(),
        panel.grid.major =element_blank(),
        panel.grid.minor = element_blank())+
  geom_abline(slope=1, intercept = 0, linetype = "dashed", alpha=0.5, color = "grey") + coord_equal()+
  theme(legend.position = c(0.70, 0.17))+coord_fixed(ratio = 0.9)+
  #ggtitle("Train set")+
  geom_line(size = 0.6)+labs(x = "1 - Specificity", y = "Sensitivity")+
  scale_color_manual(values=c("#5ebcc2","#d274ae","#849c4c"),#"#2d8c46",
                     name= "",
                      labels = legend_labels
                      )
col.list = list("#5ebcc2","#d274ae","#849c4c")#"#2d8c46",
# Add one confidence-interval ribbon per ROC curve
for(i in seq_along(dat.ci.list)) {
  p_all_species <- p_all_species + geom_ribbon(
    data = dat.ci.list[[i]],
    aes(x = 1-x, ymin = lower, ymax = upper),
    fill = col.list[[i]],
    alpha = 0.2,
    inherit.aes = F)
}
ggsave(paste("results/Biomarker_identification/Machine_learning/all_selected_6_species_model_auroc_both_set_new02",
             ".pdf", sep=""), p_all_species, width=75 * 1.5, height=60 * 1.5, unit='mm')
p_all_species

## RF models with single species
# Select the top important species to run single-species random forest models.
# NOTE(review): variable names say "32" for historical reasons — only the top
# 6 rows of imp_species are used here.
top_32 <- imp_species[1:6, ]
# Abundances of the selected species, transposed to samples x species
top_32_s <- data_species6[rownames(data_species6) %in% rownames(top_32), ]
top_32_s <- as.data.frame(t(top_32_s))

# Training-set rows; group label recovered by stripping digits from sample IDs
train_data_t32_2 <- top_32_s[rownames(top_32_s) %in% rownames(train_data_species), ]
train_data_t32_2 = as.data.frame(train_data_t32_2)
train_data_t32_2$group = rownames(train_data_t32_2)
train_data_t32_2$group = as.character(train_data_t32_2$group)
train_data_t32_2$group = gsub("[0-9]","", train_data_t32_2$group)

# Testing-set rows, same label derivation
test_data_t32_2 <- top_32_s[rownames(top_32_s) %in% rownames(test_data_species), ]
test_data_t32_2 = as.data.frame(test_data_t32_2)
test_data_t32_2$group = rownames(test_data_t32_2)
test_data_t32_2$group = as.character(test_data_t32_2$group)
test_data_t32_2$group = gsub("[0-9]","", test_data_t32_2$group)

## Single species model predictions
# training set or testing set
library(ggpubr)
ROC_Single_Species_train <- function(species01_train, species01_test = NULL){
  # Fit a random forest on a single species and draw its training-set ROC
  # curve with a bootstrapped 95% CI ribbon of sensitivity.
  #
  # species01_train: data.frame; column 1 = one species' abundance (its name
  #                  becomes the plot title), plus a `group` column holding
  #                  "Patients"/"Healthy" labels.
  # species01_test:  unused. Given a NULL default so the existing
  #                  single-argument calls below keep working (previously it
  #                  had no default and relied on lazy evaluation).
  # Returns: a ggplot object (ROC curve with CI ribbon).
  species01_train$group = factor(species01_train$group, levels = c("Patients","Healthy"))
  set.seed(999)
  species01_train <-data.frame(species01_train)
  otutab_t.rf_species01 <- randomForest(group~., data =species01_train,
                          importance = TRUE)
  # Out-of-bag class probabilities; column 2 = second factor level ("Healthy")
  train1.pre_species <- predict(otutab_t.rf_species01,type="prob")
  p.train_species <- train1.pre_species[,2]
  roc1_species_train <- roc(species01_train$group, p.train_species,
            ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,
            plot=TRUE
            )
  # get average values of AUROC and confidence intervals
  auc_species_train = round(roc1_species_train$auc,3)
  roc1_species_train2 <- plot.roc(species01_train$group, p.train_species, ci=TRUE, print.auc=TRUE)
  ci_low_species_train = round(roc1_species_train2$ci[1], 3)
  ci_high_species_train = round(roc1_species_train2$ci[3], 3)
  # calculate 95% confidence intervals of sensitivity over 25 specificities
  roc.list_top30 <- list(roc1_species_train)
  ci.list <- lapply(roc.list_top30, ci.se, specificities = seq(0, 1, l = 25))
  ciobj02 <- ci.se(roc1_species_train, 
               specificities=seq(0, 1, 0.01)) 
  # confidence intervals (lambda parameter renamed; it used to shadow ciobj3)
  ciobj3 <- as.data.frame(ciobj02)
  dat.ci.list <- lapply(ci.list, function(ci_obj)
  data.frame(x = as.numeric(rownames(ci_obj)),
             lower = ci_obj[, 1],
             upper = ci_obj[, 3]))
  # plot: ROC line, diagonal reference, AUC/CI annotations
  p3_s01 <- ggroc(roc.list_top30, legacy.axes = TRUE) + 
    theme_bw()+
    theme(panel.background = element_blank(),
        panel.grid.major =element_blank(),
        panel.grid.minor = element_blank(),
        axis.title= element_text(size=12, family = "sans"),
        axis.text.x = element_text(size = 12, family = "sans"),
        axis.text.y = element_text(size = 12, family = "sans"),
        text = element_text(family = "sans", size = 12),
        legend.position = "none") + coord_equal() + coord_fixed(ratio = 0.9)+
    geom_abline(slope=1, intercept = 0, linetype = "dashed", alpha=0.5, color = "grey") + 
    geom_line(size = 0.3)+labs(x = "1 - Specificity", y = "Sensitivity")+
    ggtitle(colnames(species01_train)[1])+
    annotate("text", x = 0.75, y = 0.17, label = paste0("AUC = ", auc_species_train), size = 4)+
    annotate("text", x = 0.75, y = 0.1, label = paste0("CI = ",  ci_low_species_train, "-", ci_high_species_train), size = 4)+
  scale_color_manual(values=c("#74add1"))
  
  col.list = list("#74add1")
  # add confidence-interval ribbon (single curve)
  for(i in 1:1) {
    p3_s01 <- p3_s01 + geom_ribbon(
      data = dat.ci.list[[i]],
      aes(x = 1-x, ymin = lower, ymax = upper),
      fill = col.list[[i]],
      alpha = 0.3,
      inherit.aes = F)
  }
  p3_s01
}

# One ROC plot per selected species (columns 1-6; column 7 is `group`).
# The list is preallocated instead of grown inside the loop;
# capture.output(suppressWarnings(...)) keeps pROC's console chatter quiet.
plots <- vector("list", 6)
capture.output(suppressWarnings(for (i in seq_len(6)) {
  species01 <- train_data_t32_2[, c(i, 7)]
  plots[[i]] <- ROC_Single_Species_train(species01)
}))

character(0)
# combine plots into a 2 x 3 panel (requires the `conflicted` package)
conflicts_prefer(ggpubr::ggarrange)
final_plot <- ggarrange(plotlist = plots, nrow = 2, ncol = 3)
# save plots
pdf("results/Biomarker_identification/Machine_learning/ROC_single_species_train_data_top6_new4.pdf", 
    width = 12, height = 10)
# print() explicitly: a bare `final_plot` auto-prints only at the interactive
# top level, so the PDF would come out empty when this script is source()d.
print(final_plot)
dev.off()
png 
  2 
# Show the combined ROC panel on the current graphics device
final_plot

4. Correlation network analysis (4.相关和网络分析)

Network analysis 1 (SparCC network analysis)

SparCC co-abundance network using R software

# 1. Data preparation
# SparCC correlations
# load packages
library(reshape2)
library(ggplot2)
library(ggprism)
library(dplyr)
library(plyr)
library(igraph)

# Sample metadata (rows = sample IDs, includes a Group column)
design <- read.table("data/group.txt", sep = "\t", header = TRUE, row.names = 1)
# Species abundance table with a Species column
df3 <- read.table("data/species_data.txt", sep = "\t", header = TRUE, check.names = FALSE)

# Collapse duplicate species rows by summing their abundances, then move
# the Species column into the row names.
data <- aggregate(. ~ Species, data = df3, FUN = sum)
rownames(data) <- data$Species
data <- data[, -1]

# Retain microbiota species with prevalence >= 5% of samples.
# (NOTE(review): the original comment also promised a relative-abundance
# filter > 1e-4, but no such filter is applied in this script.)
# 1. prevalence: count zero cells per species, vectorized — replaces a
# cell-by-cell double loop over the whole table.
zero_counts <- as.integer(rowSums(data == 0))
# output
zero_count = as.data.frame(zero_counts)
data2 = data
data2$zero_counts = zero_count$zero_counts
data2$all_counts = ncol(data)  # number of samples (was hard-coded to 20)
data2$sample_percent = round(1-data2$zero_counts/data2$all_counts, 6)
data3 = data2 %>% filter(data2$sample_percent >= 0.05)
# drop the three helper columns appended above
# (zero_counts, all_counts, sample_percent — previously hard-coded as 21:23)
data3 = data3[, -((ncol(data3)-2):ncol(data3))]

# check data
dim(data3)
[1] 695  20
# Scale relative abundances up to pseudo-counts for SparCC/fastspar input
data3 = data3 * 100000
#write.table(sub_otutab_npc, file = "results/Correlation_network_analysis/sparcc_p01.txt", row.names = T, sep = "\t", quote = T, col.names = T)

# Build an OTU_id-first table, then transpose so rows = samples.
# NOTE(review): cbind() of a character column with numeric columns means t()
# yields a character matrix — every downstream value is a string; confirm the
# consumer (fastspar / write.table) re-parses them as numbers.
OTU.table.filtered.colnames <- colnames(data3)
OTU.table.filtered.sparcc <- cbind(rownames(data3), data3)
colnames(OTU.table.filtered.sparcc) <- c("OTU_id", OTU.table.filtered.colnames)
OTU.table.filtered.sparcc2 <- t(OTU.table.filtered.sparcc)
# drop the transposed OTU_id row (it duplicates the column names)
OTU.table.filtered.sparcc2 <- OTU.table.filtered.sparcc2[-1,]
OTU.table.filtered.sparcc2 <- as.data.frame(OTU.table.filtered.sparcc2)

# Derive a group label from each sample name: strip digits, then rename
# "Healthy" to "Control"
OTU.table.filtered.sparcc2$group <- rownames(OTU.table.filtered.sparcc2)
OTU.table.filtered.sparcc2$group = as.character(OTU.table.filtered.sparcc2$group)
#OTU.table.filtered.sparcc2$group = sub("[0-9]","_", OTU.table.filtered.sparcc2$group)
OTU.table.filtered.sparcc2$group = gsub("[0-9]","", OTU.table.filtered.sparcc2$group)
OTU.table.filtered.sparcc2$group = gsub("Healthy","Control", OTU.table.filtered.sparcc2$group)

# Back to species-by-sample orientation; the `group` column becomes a final row
otutab <- as.data.frame(t(OTU.table.filtered.sparcc2))

# Select by manual set group
# NPC (Patients) group — the original wrapped this in a dead `if (TRUE) {...}`
sub_design = subset(design, Group %in% c("Patients")) 
sub_design$Group  = factor(sub_design$Group, levels=c("Patients"))
# keep only samples present in the OTU table
idx = rownames(sub_design) %in% colnames(otutab)
sub_design_npc = sub_design[idx,]
sub_otutab_npc = otutab[,rownames(sub_design_npc)]
# NOTE(review): row 37 is dropped here — presumably to remove the appended
# `group` label row, but after the transpose above that row is the LAST row
# of the table, not row 37. Verify which row this is meant to remove.
sub_otutab_npc = sub_otutab_npc[-37, ]
sub_otutab_npc <- as.data.frame(sub_otutab_npc)
write.table(sub_otutab_npc, file = "results/Correlation_network_analysis/species_sparcc_p01.txt", 
            row.names = T, sep = "\t", quote = T, col.names = T)

# Healthy (Control) group — the original wrapped this in a dead `if (TRUE) {...}`
sub_design = subset(design, Group %in% c("Control")) 
sub_design$Group  = factor(sub_design$Group, levels=c("Control"))
# keep only samples present in the OTU table
idx = rownames(sub_design) %in% colnames(otutab)
sub_design_healthy = sub_design[idx,]
sub_otutab_healthy = otutab[,rownames(sub_design_healthy)]
# NOTE(review): as in the Patients branch, dropping row 37 is presumably meant
# to remove the appended `group` label row, which is the LAST row here — verify.
sub_otutab_healthy = sub_otutab_healthy[-37, ]
sub_otutab_healthy = as.data.frame(sub_otutab_healthy)
write.table(sub_otutab_healthy, file = "results/Correlation_network_analysis/species_sparcc_h01.txt", 
            row.names = T, sep = "\t", quote = T, col.names = T)

# OTU.table.filtered.sparcc <- OTU.table.filtered.sparcc[, -1]
# We need to transpose the table
# Patients group
#write.table(sub_otutab_npc, file = "results/genus_sparcc_p01_11R_2.tsv", row.names = T, sep = "\t", quote = T, col.names = T)
#write.table(sub_otutab_healthy, file = "results/genus_sparcc_h01_11R_2.tsv", row.names = T, sep = "\t", quote = T, col.names = T)

## 2. SparCC correlation analysis
#!/usr/bin/bash

# install FastSpar
#conda install -c bioconda -c conda-forge fastspar

# Convert CSV file to TSV file
#awk -F ',' 'BEGIN {OFS="\t"} {$1=$1}1' tests/data/sparcc_npc_h01_11R.txt > tests/data/sparcc_npc_h01_11R.tsv
#awk -F ',' 'BEGIN {OFS="\t"} {$1=$1}1' tests/data/sparcc_npc_p01_11R.txt > tests/data/sparcc_npc_p01_11R.tsv

# Control group
# Correlation inference
# test run
#fastspar --otu_table tests/data/fake_data.tsv --correlation median_correlation.tsv --covariance median_covariance.tsv
#fastspar --otu_table tests/data/Species_SparCC_HC_table5.tsv --correlation median_correlation_HC.tsv --covariance median_covariance_HC.tsv

# change iterations
#fastspar --iterations 100 --exclude_iterations 20 --otu_table tests/data/fake_data.tsv --correlation median_correlation.tsv --covariance median_covariance.tsv
#fastspar --iterations 100 --exclude_iterations 20 --otu_table tests/data/Species_SparCC_HC_table5.tsv --correlation median_correlation_HC.tsv --covariance median_covariance_HC.tsv
# change thresholds
#fastspar --threshold 0.1 --otu_table tests/data/fake_data.tsv --correlation median_correlation.tsv --covariance median_covariance.tsv
#fastspar --iterations 100 --threads 10 --threshold 0.1 --otu_table tests/data/Species_SparCC_HC_table5.tsv --correlation median_correlation_HC.tsv --covariance median_covariance_HC.tsv

# combine
#fastspar --iterations 100 --threads 3 --threshold 0.1 --otu_table tests/data/Species_SparCC_HC_table5.tsv --correlation median_correlation_HC.tsv --covariance median_covariance_HC.tsv

# Calculation of exact *p*-values
# First we generate the 1000 bootstrap counts:
#conda install -c intel mkl

#mkdir bootstrap_counts_HC
#fastspar_bootstrap --otu_table tests/data/Species_SparCC_HC_table5.tsv --number 1000 --prefix bootstrap_counts_HC/HC_data

# And then infer correlations for each bootstrap count (running in parallel with all processes available):
#mkdir bootstrap_correlation
#parallel fastspar --otu_table {} --correlation bootstrap_correlation/cor_{/} --covariance bootstrap_correlation/cov_{/} -i 5 ::: bootstrap_counts/*
  
# From these correlations, the *p*-values are then calculated:
#fastspar_pvalues --otu_table tests/data/fake_data.tsv --correlation median_correlation.tsv --prefix bootstrap_correlation/cor_fake_data_ --permutations 1000 --outfile pvalues.tsv

# add threads to accelerate
#fastspar --otu_table tests/data/fake_data.txt --correlation median_correlation.tsv --covariance median_covariance.tsv --iterations 50 --threads 10

# Alternative solutions
# Using sparcc in iNAP online website for analysis, the website is doing really well
# https://inap.denglab.org.cn/


## 3. Visualization based on Gephi software
library(plyr)
library(magrittr)
library(tidyr)
library(dplyr)
library(igraph)

# NPC group: read the SparCC correlation (r) and p-value matrices
r.cor <- read.table("data/r.cor.txt", sep = "\t", header = TRUE,
                    check.names = FALSE, row.names = 1)
p.cor <- read.table("data/p.cor.txt", sep = "\t", header = TRUE,
                    check.names = FALSE, row.names = 1)
# Zero out correlations that are not significant at p <= 0.05
r.cor[p.cor > 0.05] <- 0

# Build network edge (connection) and node attribute tables.
# The correlation and p-value matrices are converted to long format,
# merged, filtered, and turned into an igraph object for Gephi/Cytoscape.
r.cor$from <- rownames(r.cor)
p.cor$from <- rownames(p.cor)

# Long-format p-values with a BH-adjusted FDR per variable pair
p_value <- p.cor %>%
  gather(key = "to", value = "p", -from) %>%
  data.frame()
p_value$FDR <- p.adjust(p_value$p, "BH")
p_value <- p_value[, -3]  # drop the raw p column; keep from/to/FDR

# Edge table: merge r with FDR, keep significant (FDR <= 0.1), non-self,
# non-trivial (|r| >= 0.15) pairs, then add the line attributes used for
# plotting. filter() also drops any NA rows that base subsetting would keep.
cor.data <- r.cor %>%
  gather(key = "to", value = "r", -from) %>%
  data.frame() %>%
  left_join(p_value, by = c("from", "to")) %>%
  filter(FDR <= 0.1, abs(r) >= 0.15, from != to) %>%
  mutate(
    linecolor = ifelse(r > 0, "positive", "negative"),
    linesize = abs(r)
  )

# Node table: one row per unique node name appearing in any edge.
# (The original called clusterProfiler::summarise(), which fails unless
# clusterProfiler is installed; sort(unique(...)) needs no extra package
# and reproduces the same sorted, distinct set of names.)
vertices <- data.frame(
  name = sort(unique(c(as.character(cor.data$from),
                       as.character(cor.data$to)))),
  stringsAsFactors = FALSE
)

# Build the undirected graph, attach edge weights and node labels
graph <- graph_from_data_frame(cor.data, vertices = vertices, directed = FALSE)
E(graph)$weight <- abs(E(graph)$r)
V(graph)$label <- V(graph)$name
# save data
write_graph(graph, "results/Correlation_network_analysis/patients01.graphml", format="graphml")

# Visualized in Gephi software or Cytoscape software
# The same procedure for healthy group

network analysis-2 (Spearman Correlation Network Using Gephi Software)

Visualization with Gephi (利用Gephi可视化)

# Load packages
library(igraph) # Network Analysis and Visualization
library(Hmisc)  # Harrell Miscellaneous
library(psych)  # Procedures for Psychological, Psychometric, and Personality Research
library(dplyr)  # A Grammar of Data Manipulation
library(tidyr)  # Tidy Messy Data

# conflicts_prefer() comes from the 'conflicted' package — attach it first.
# NOTE(review): prefer dplyr::mutate directly; the original preferred
# ggpubr::mutate, which is only a re-export of dplyr's mutate.
conflicts_prefer(dplyr::mutate)
conflicts_prefer(dplyr::summarise)
conflicts_prefer(plyr::arrange)

# Species relative abundances (percentages -> proportions)
mic <- read.table("data/Spearman_network_data.txt", sep="\t", header=TRUE, check.names=FALSE, row.names = 1)
mic <- apply(mic, 2, function(x) x / 100)
# KO (gene family) abundances and the node grouping table
gene <- read.table("data/Spearman_KO_data.txt", sep="\t", header=TRUE, check.names=FALSE, row.names = 1)
group <- read.table("data/group2.txt", sep="\t", header=TRUE, check.names=FALSE)

# Transpose so samples are rows, then merge species and KO tables by sample
mic <- as.data.frame(t(mic))
mic$sample <- rownames(mic)
gene <- as.data.frame(t(gene))
gene$sample <- rownames(gene)
df <- merge(mic, gene, by = "sample")
rownames(df) <- df$sample
df <- df[-1]  # drop the 'sample' column (kept as row names)
head(df)
           Species01 Species02 Species03 Species04 Species05 Species06
Patients01 0.0001992   0.0e+00         0  0.00e+00 0.0000000 0.0002268
Patients02 0.0000000   0.0e+00         0  0.00e+00 0.0000300 0.0037749
Patients03 0.0000000   4.7e-06         0  0.00e+00 0.0000000 0.0078025
Patients04 0.0000000   0.0e+00         0  0.00e+00 0.0000000 0.0013194
Patients05 0.0000000   0.0e+00         0  0.00e+00 0.0000000 0.0000000
Patients06 0.0005237   0.0e+00         0  4.71e-05 0.0002594 0.0027443
           Species07 Species08 Species09 Species10 Species11 Species12
Patients01 0.0000000 0.0000000 0.0002184 0.0000000 0.0000000 0.0000449
Patients02 0.0000383 0.0047634 0.0099377 0.0000000 0.0000000 0.0000000
Patients03 0.0003397 0.0000000 0.0000000 0.0000000 0.0008635 0.0121608
Patients04 0.0008964 0.0004650 0.0290622 0.0000466 0.0000000 0.0000000
Patients05 0.0000000 0.0000000 0.0000000 0.0000000 0.0008143 0.0000000
Patients06 0.0000000 0.0008082 0.0009162 0.0009799 0.0006761 0.0001429
           Species13 Species14 Species15 Species16 Species17 Species18
Patients01 0.0088900 0.0030027 0.0484625 0.0535034         0 0.0004194
Patients02 0.0000884 0.0000111 0.1454322 0.0032423         0 0.0000000
Patients03 0.0171689 0.0000000 0.0000363 0.0000000         0 0.0046459
Patients04 0.0006176 0.0035240 0.1092701 0.0085378         0 0.0015125
Patients05 0.0000576 0.0000407 0.0039117 0.0004556         0 0.0003702
Patients06 0.0001377 0.0000526 0.0011158 0.0000696         0 0.0016015
           Species19 Species20 Species21 Species22 Species23 Species24
Patients01  0.000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Patients02  0.000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Patients03  0.000996 0.0069058 0.0078448 0.0008736 0.0005681 0.0002662
Patients04  0.000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Patients05  0.000000 0.0009546 0.0012607 0.0000000 0.0000000 0.0000000
Patients06  0.000034 0.0027267 0.0014984 0.0002432 0.0000965 0.0001687
           Species25 Species26 Species27 Species28 Species29 Species30
Patients01 0.0000000 0.0000000 0.0000000 0.0060147  0.00e+00 0.0000000
Patients02 0.0000000 0.0000000 0.0000000 0.0000217  0.00e+00 0.0000000
Patients03 0.0000000 0.0132018 0.0051837 0.0008164  7.87e-05 0.0000000
Patients04 0.0000000 0.0019413 0.0015781 0.0000000  0.00e+00 0.0000000
Patients05 0.0000000 0.0003286 0.0000000 0.0298093  0.00e+00 0.0003695
Patients06 0.0001864 0.0036324 0.0084354 0.0021779  9.20e-06 0.0000000
           Species31 Species32 Species33 Species34 Species35 Species36
Patients01         0  0.00e+00  0.00e+00 0.0000000 0.0000000 0.0000000
Patients02         0  0.00e+00  0.00e+00 0.0000000 0.0000000 0.0000551
Patients03         0  0.00e+00  0.00e+00 0.0024491 0.0006197 0.0006049
Patients04         0  0.00e+00  0.00e+00 0.0000000 0.0000000 0.0028813
Patients05         0  0.00e+00  0.00e+00 0.0000000 0.0000000 0.0000000
Patients06         0  5.93e-05  1.51e-05 0.0026383 0.0000000 0.0001643
           Species37 Species38 Species39 Species40 Species41 Species42
Patients01 0.0042952         0         0 0.0000000  0.00e+00 0.0000168
Patients02 0.0000000         0         0 0.0066072  0.00e+00 0.0001719
Patients03 0.0000594         0         0 0.0002051  2.90e-06 0.0022513
Patients04 0.0000000         0         0 0.0000000  0.00e+00 0.0001092
Patients05 0.0000000         0         0 0.0033590  0.00e+00 0.0000000
Patients06 0.0003306         0         0 0.0000000  3.33e-05 0.0000000
           Species43 Species44 Species45 Species46 Species47 Species48
Patients01 0.0000000 0.0005825 0.0000000         0 0.0000000  0.00e+00
Patients02 0.0000000 0.0000000 0.0000000         0 0.0000000  0.00e+00
Patients03 0.0016647 0.0159168 0.0039999         0 0.0000000  6.45e-05
Patients04 0.0000000 0.0000000 0.0000000         0 0.0070733  0.00e+00
Patients05 0.0000000 0.0000000 0.0000000         0 0.0306763  0.00e+00
Patients06 0.0000000 0.0014860 0.0007880         0 0.0026179  0.00e+00
           Species49 Species50 Species51 Species52 Species53 Species54
Patients01         0 0.0000000 0.0022000 0.0000000 0.0001471 0.0035067
Patients02         0 0.0000000 0.0000000 0.0000000 0.0000000 0.0244186
Patients03         0 0.0067599 0.0154999 0.0010194 0.0000000 0.0006736
Patients04         0 0.0007462 0.0000000 0.0000000 0.0002468 0.0049033
Patients05         0 0.0011148 0.0000000 0.0029021 0.0120790 0.0742965
Patients06         0 0.0000000 0.0021384 0.0001013 0.0000487 0.0009094
           Species55 Species56 Species57 Species58 Species59 Species60
Patients01 0.0006575 0.0000137 0.0006329 0.0068236 0.0000000 0.0000000
Patients02 0.0066833 0.0006340 0.0007965 0.0031478 0.0001694 0.0411671
Patients03 0.0006155 0.0000000 0.0000000 0.0000000 0.0000000 0.0000824
Patients04 0.0002064 0.0014813 0.0001085 0.0008470 0.0000720 0.0000836
Patients05 0.0001462 0.0006683 0.0000000 0.0035369 0.0014096 0.0092520
Patients06 0.0004803 0.0000000 0.0000000 0.0005778 0.0000297 0.0104302
           Species61 Species62 Species63 Species64 Species65 Species66
Patients01 0.0000000         0 0.0000000 0.0000000 0.0000473 0.0000000
Patients02 0.0000000         0 0.0000000 0.0000000 0.0000000 0.0000000
Patients03 0.0020968         0 0.0031690 0.0001918 0.0059577 0.0080153
Patients04 0.0000000         0 0.0000409 0.0000129 0.0000000 0.0006103
Patients05 0.0000000         0 0.0000000 0.0000000 0.0000000 0.0000000
Patients06 0.0007391         0 0.0000000 0.0000000 0.0000062 0.0000000
           Species67 Species68 Species69 Species70 Species71 Species72
Patients01 0.0000000 0.0102598 0.0014105 0.0000000         0     0e+00
Patients02 0.0000000 0.1318139 0.0000000 0.0024226         0     0e+00
Patients03 0.0059957 0.0300415 0.0509729 0.0000000         0     5e-06
Patients04 0.0000000 0.0012709 0.0000000 0.0000000         0     0e+00
Patients05 0.0000000 0.0353636 0.0000000 0.0000000         0     0e+00
Patients06 0.0000690 0.0002325 0.0058942 0.0000000         0     0e+00
           Species73 Species74 Species75 Species76 Species77 Species78
Patients01 0.0000000 0.0054484         0 0.0000000 0.0000000         0
Patients02 0.0000000 0.0000000         0 0.0000000 0.0000000         0
Patients03 0.0000126 0.0000000         0 0.0000000 0.0004047         0
Patients04 0.0000000 0.0000000         0 0.0000000 0.0000000         0
Patients05 0.0000000 0.0000000         0 0.0000000 0.0000000         0
Patients06 0.0001204 0.0000000         0 0.0001239 0.0000000         0
           Species79 Species80 Species81 Species82 Species83 Species84
Patients01         0 0.0001895  0.00e+00 0.0000000 0.1107871   0.0e+00
Patients02         0 0.0020453  0.00e+00 0.0000569 0.0000000   0.0e+00
Patients03         0 0.0000372  1.96e-05 0.0003917 0.0019890   0.0e+00
Patients04         0 0.0001923  0.00e+00 0.0000000 0.0000000   0.0e+00
Patients05         0 0.0036031  0.00e+00 0.0001910 0.0000000   0.0e+00
Patients06         0 0.0000000  0.00e+00 0.0254755 0.0000000   4.6e-06
           Species85 Species86 Species87 Species88 Species89 Species90
Patients01         0 0.0060479 0.0000000  5.30e-06   1.2e-05 0.0000000
Patients02         0 0.0000000 0.0000000  0.00e+00   0.0e+00 0.0000000
Patients03         0 0.0002256 0.0039633  1.04e-05   0.0e+00 0.0005639
Patients04         0 0.0000000 0.0000000  0.00e+00   0.0e+00 0.0001344
Patients05         0 0.0000000 0.0000000  0.00e+00   0.0e+00 0.0000169
Patients06         0 0.0002721 0.0000000  0.00e+00   0.0e+00 0.0000567
           Species91 Species92 Species93 Species94 Species95 Species96
Patients01 0.0000000 0.0000000 0.0000000         0 0.0024523 0.0000000
Patients02 0.0084149 0.0000000 0.0287016         0 0.0055949 0.0000000
Patients03 0.0000000 0.0030147 0.0000140         0 0.0005346 0.0001082
Patients04 0.0010065 0.0000000 0.0000000         0 0.0000750 0.0000000
Patients05 0.0000000 0.0000000 0.0001670         0 0.0005845 0.0000000
Patients06 0.0000000 0.0000000 0.0000105         0 0.0000000 0.0000175
           Species97 Species98 Species99 Species100 Species101 Species102
Patients01 0.0000000 0.0000000  5.51e-04  0.0009535          0  0.0000000
Patients02 0.0000000 0.0000000  0.00e+00  0.0000000          0  0.0000000
Patients03 0.0000000 0.0062379  8.74e-05  0.0102851          0  0.0005359
Patients04 0.0000000 0.0000000  0.00e+00  0.0000000          0  0.0000000
Patients05 0.0000000 0.0000000  0.00e+00  0.0000000          0  0.0000000
Patients06 0.0007471 0.0022021  0.00e+00  0.0001538          0  0.0000000
           Species103 Species104 Species105 Species106 Species107 Species108
Patients01  0.0000000  0.0001452  0.0025476  0.0000223          0  0.0000000
Patients02  0.0000000  0.0001737  0.0000000  0.0000000          0  0.0000000
Patients03  0.0000348  0.0001332  0.0082329  0.0000000          0  0.0176138
Patients04  0.0000000  0.0000000  0.0317250  0.0238322          0  0.0000000
Patients05  0.0016034  0.0000000  0.0000000  0.0000546          0  0.0000000
Patients06  0.0000000  0.0000000  0.0000784  0.0000000          0  0.0000000
           Species109 Species110 Species111 Species112 Species113 Species114
Patients01  0.0032992          0  0.0000000  0.0027984          0  0.0005566
Patients02  0.0003825          0  0.0000000  0.0221687          0  0.0003348
Patients03  0.0078424          0  0.0509262  0.0093032          0  0.0000767
Patients04  0.0000000          0  0.0007866  0.0006682          0  0.0000000
Patients05  0.0003675          0  0.0170159  0.0194056          0  0.0000000
Patients06  0.0002681          0  0.0012940  0.0041489          0  0.0000000
           Species115 Species116 Species117 Species118 Species119 Species120
Patients01  0.0002689  0.0000000  0.0018668   0.00e+00          0          0
Patients02  0.0000000  0.0002194  0.0000000   0.00e+00          0          0
Patients03  0.0000000  0.0000000  0.0008595   7.08e-05          0          0
Patients04  0.0000000  0.0000000  0.0000000   0.00e+00          0          0
Patients05  0.0000000  0.0000000  0.0000000   0.00e+00          0          0
Patients06  0.0000000  0.0000000  0.0000000   0.00e+00          0          0
             K00001   K00002   K00003   K00004   K00005   K00006   K00007
Patients01 4.85e-07 4.06e-06 1.15e-06 1.95e-06 3.76e-06 0.00e+00 2.38e-06
Patients02 6.72e-06 8.98e-06 0.00e+00 3.08e-06 7.69e-07 2.80e-06 2.47e-05
Patients03 5.42e-07 0.00e+00 2.86e-06 5.00e-06 5.80e-06 0.00e+00 6.66e-06
Patients04 0.00e+00 0.00e+00 0.00e+00 0.00e+00 2.48e-07 0.00e+00 0.00e+00
Patients05 1.71e-06 2.92e-06 6.80e-07 9.30e-07 0.00e+00 9.66e-07 8.15e-06
Patients06 4.28e-06 8.78e-06 0.00e+00 2.83e-06 0.00e+00 2.56e-06 7.07e-06
             K00008   K00009   K00010   K00011   K00012   K00013   K00014
Patients01 0.00e+00 0.00e+00 0.00e+00 0.00e+00 4.60e-06 3.43e-06 0.00e+00
Patients02 1.57e-06 1.42e-05 5.15e-06 1.15e-05 6.37e-06 1.58e-06 6.71e-06
Patients03 0.00e+00 1.26e-06 0.00e+00 2.10e-07 0.00e+00 1.02e-05 0.00e+00
Patients04 0.00e+00 0.00e+00 0.00e+00 5.67e-07 0.00e+00 4.36e-07 0.00e+00
Patients05 6.90e-07 3.82e-06 1.11e-06 1.90e-06 4.16e-07 1.13e-06 0.00e+00
Patients06 3.53e-06 1.53e-05 1.82e-06 9.00e-06 2.71e-06 2.62e-05 3.84e-06
             K00015   K00016   K00017   K00018   K00019   K00020   K00021
Patients01 4.16e-06 0.00e+00 7.39e-06 2.18e-06 0.00e+00 5.63e-07 5.69e-06
Patients02 7.12e-06 6.36e-06 2.17e-07 2.82e-06 1.06e-05 7.09e-06 3.04e-07
Patients03 0.00e+00 0.00e+00 1.41e-06 0.00e+00 0.00e+00 3.12e-07 4.54e-06
Patients04 0.00e+00 0.00e+00 1.08e-05 0.00e+00 0.00e+00 5.37e-06 8.31e-07
Patients05 2.88e-06 1.64e-06 4.78e-07 1.17e-06 2.58e-06 5.91e-06 1.76e-06
Patients06 2.91e-06 3.07e-06 1.09e-06 8.25e-06 3.69e-06 1.35e-05 1.16e-05
             K00022   K00023   K00024   K00025   K00026   K00027   K00028
Patients01 2.36e-07 0.00e+00 5.15e-07 0.00e+00 0.00e+00 0.00e+00 0.00e+00
Patients02 0.00e+00 1.38e-05 2.16e-07 1.48e-05 1.34e-06 7.81e-06 4.42e-06
Patients03 7.92e-06 0.00e+00 1.01e-05 2.60e-07 0.00e+00 2.45e-07 0.00e+00
Patients04 2.48e-07 0.00e+00 2.70e-07 0.00e+00 1.27e-06 0.00e+00 8.89e-08
Patients05 0.00e+00 6.15e-07 9.26e-07 0.00e+00 1.00e-05 2.44e-07 1.95e-06
Patients06 4.52e-06 1.73e-06 2.85e-06 1.61e-06 7.37e-06 2.81e-06 5.69e-06
             K00029   K00030   K00031   K00032   K00033   K00034   K00035
Patients01 1.35e-06 5.17e-06 0.00e+00 2.37e-06 0.00e+00 0.00e+00 0.00e+00
Patients02 0.00e+00 1.67e-06 6.73e-06 0.00e+00 1.39e-06 7.30e-06 2.10e-06
Patients03 3.20e-06 1.47e-05 0.00e+00 1.13e-05 1.44e-07 0.00e+00 0.00e+00
Patients04 0.00e+00 3.29e-07 0.00e+00 0.00e+00 1.03e-05 0.00e+00 0.00e+00
Patients05 4.95e-07 2.16e-06 3.38e-07 1.37e-06 3.98e-06 7.01e-07 2.31e-07
Patients06 1.14e-06 6.32e-06 5.09e-06 1.79e-05 1.20e-05 1.66e-06 1.75e-06
             K00036   K00037   K00038   K00039
Patients01 2.19e-07 2.39e-06 5.54e-07 0.00e+00
Patients02 0.00e+00 0.00e+00 0.00e+00 1.33e-05
Patients03 1.94e-06 4.90e-06 7.85e-06 0.00e+00
Patients04 2.62e-07 2.64e-07 7.76e-06 0.00e+00
Patients05 4.19e-07 0.00e+00 0.00e+00 2.13e-06
Patients06 2.45e-05 0.00e+00 1.69e-05 5.62e-06
# Correlations were calculated and data were processed with p>0.05 as the screening threshold
data <- as.matrix(df)
cor <- corr.test(data, method = "spearman", adjust = "BH")
data.cor <- as.data.frame(cor$r)

# Keep the full variable-by-variable matrices. The original hard-coded
# 1:159 (120 species + 39 KOs); ncol() generalizes to any input size.
# NOTE(review): corr.test() stores adjusted p-values in the upper triangle
# of cor$p — confirm which triangle is intended before interpreting FDR.
n.var <- ncol(data)
r.cor <- data.frame(cor$r)[seq_len(n.var), seq_len(n.var)]
p.cor <- data.frame(cor$p)[seq_len(n.var), seq_len(n.var)]
# Zero out correlations that are not significant at p <= 0.05
r.cor[p.cor > 0.05] <- 0
# r.cor[abs(r.cor) < 0.2] <- 0   # optional |r| threshold

# Constructing network connection properties and node properties
r.cor$from <- rownames(r.cor)
p.cor$from <- rownames(p.cor)
# Long-format p-values with a BH-adjusted FDR per pair
p_value <- p.cor %>%
  gather(key = "to", value = "p", -from) %>%
  data.frame()
p_value$FDR <- p.adjust(p_value$p, "BH")
p_value <- p_value[, -3]  # drop the raw p column; keep from/to/FDR
# Edge table: merge r with FDR and add plotting attributes (no filtering
# here — all pairs are exported; filter downstream if needed)
cor.data <- r.cor %>%
  gather(key = "to", value = "r", -from) %>%
  data.frame() %>%
  left_join(p_value, by = c("from", "to")) %>%
  mutate(
    linecolor = ifelse(r > 0, "positive", "negative"),
    linesize = abs(r)
  )
write.csv(cor.data, "results/Correlation_network_analysis/Species_KO_Spearman_correlations.csv")
# cor.data <- cor.data[abs(cor.data$r)>0.2, ]   # optional extra filter

# Setting node properties: one row per unique node, joined to its group
vertices <- c(as.character(cor.data$from), as.character(cor.data$to)) %>%
  as_tibble() %>%
  group_by(value) %>%
  summarise()
colnames(vertices) <- "name"
vertices <- vertices %>%
  left_join(group, by = "name")
# Order node types so Species precede KOs in the exported network
vertices$group <- factor(vertices$group, levels = c("Species", "KO"))
vertices <- vertices %>%
  arrange(group)

# Constructing graph data structure
graph <- graph_from_data_frame(cor.data, vertices = vertices, directed = FALSE)
E(graph)$weight <- abs(E(graph)$r)
V(graph)$label <- V(graph)$name
# Save data (visualize in Gephi or Cytoscape)
write_graph(graph, "results/Correlation_network_analysis/Healthy_Spearman_network01.graphml", format="graphml")

# 基于Gephi软件或者Cytoscape软件进行可视化
# Visualization based on Gephi software or Cytoscape software

network analysis-3 (Spearman correlation network using igraph)

## 1. data preparation
library(tidyverse)
library(igraph)
library(psych)

### 1.1 Observation-Variable Data Table
data <- read.csv("data/data.csv", header = TRUE,
                 row.names = 1,
                 check.names = FALSE,
                 comment.char = "")

### 1.2 Variable classification table
type <- read.csv("data/type.csv", header = TRUE, check.names = FALSE)

## 2. Determine the correlation relationship
### 2.1 Calculate the correlation coefficient. The Spearman coefficient is recommended.
cor <- corr.test(data, use = "pairwise", method = "spearman", adjust = "holm", alpha = .05, ci = FALSE)
cor.r <- data.frame(cor$r)
cor.p <- data.frame(cor$p)
# Variable names may contain special characters; re-sync the column names
# with the row names so the matrices stay consistent.
colnames(cor.r) <- rownames(cor.r)
colnames(cor.p) <- rownames(cor.p)
# write.csv() ignores 'col.names' with a warning, so it is not passed here;
# row names carry the variable names.
write.csv(cor.r, "results/Correlation_network_analysis/cor.r.csv", quote = FALSE, row.names = TRUE)
write.csv(cor.p, "results/Correlation_network_analysis/cor.p.csv", quote = FALSE, row.names = TRUE)

# Preview the correlation matrix
knitr::kable(
  head(cor.r),
  caption = "cor.r"
)
cor.r
pH AK AP OM NH4 NO3 Wc Pb Salt Bacillus firmus Bacillus megaterium Bacillus thuringiensis Bacillus subtilis Bacillus subterraneus Bacillus asahii Bacillus pumilus Candidimonas bauzanensis Fictibacillus phosphorivorans Aspergillus niger Fusarium oxysporum Chaetomium globosum Gymnascella hyalinospora Penicillium chrysogenum Fusarium solani Trichocladium asperum Aspergillus ustus Aspergillus terreus Pseudallescheria boydii
pH 1.0000000 -0.6684011 -0.4804946 -0.2438234 -0.1007803 -0.7431736 -0.0331600 0.2223669 0.2906374 0.2360210 0.4208132 0.1801592 -0.5728676 0.2342806 0.1374839 -0.3404478 -0.5884099 -0.1703104 0.1398375 0.1720326 -0.2055286 0.3183329 -0.3480111 -0.2995723 -0.0162813 -0.3694945 0.0717738 -0.3297533
AK -0.6684011 1.0000000 0.7623377 0.7324675 0.2948052 0.4610390 0.1389610 -0.2415584 -0.1246753 -0.3233766 -0.4215655 -0.6297376 0.6154758 -0.0825003 -0.3579862 0.4074773 0.5802067 -0.2235270 -0.5293927 -0.3254304 0.4780773 -0.6043092 0.5231298 -0.0520899 -0.0299187 0.3835584 0.4179111 0.7397974
AP -0.4804946 0.7623377 1.0000000 0.7207792 0.2025974 0.2194805 0.1194805 0.0935065 0.2103896 0.0259740 -0.1292628 -0.7699733 0.4111930 0.2704926 -0.2560826 0.1296519 0.4747146 -0.1055725 -0.2936018 -0.1740825 0.5644690 -0.6752499 0.3765492 -0.1656192 -0.1372360 0.4106172 0.3124190 0.6031986
OM -0.2438234 0.7324675 0.7207792 1.0000000 0.4441558 0.1688312 0.4649351 -0.1311688 0.0441558 -0.1415584 -0.1656382 -0.8030478 0.4249870 0.2055744 -0.3354880 0.2659186 0.3097141 -0.4359754 -0.4923677 -0.3715492 0.5508282 -0.6831322 0.2801318 -0.2457575 0.0637399 0.2130880 0.5004114 0.6613545
NH4 -0.1007803 0.2948052 0.2025974 0.4441558 1.0000000 0.0077922 0.9038961 -0.4246753 0.0064935 -0.2935065 -0.2695681 -0.3267756 0.3317132 0.3015993 -0.3818079 0.2169684 0.2569680 -0.2587179 -0.1994154 -0.4767782 0.1721338 -0.3652130 0.2123790 -0.0213702 0.3700818 0.3077938 0.2840173 0.2515582
NO3 -0.7431736 0.4610390 0.2194805 0.1688312 0.0077922 1.0000000 0.0480519 -0.5220779 -0.5142857 -0.0649351 -0.2312439 -0.1203910 0.6404364 -0.1487710 0.0119108 0.1997697 0.2907796 0.2091900 -0.3104904 -0.3760962 0.3683014 -0.1931162 0.1302939 0.3940134 0.1333335 0.0148823 -0.2312712 0.2569680
#head(cor.p)

## 2.2 Determine the correlation relationship
# Keep the correlation between variables with p<=0.05 and abs(r)>=0.6
# Adjacency-matrix alternative (kept for reference):
# cor.r[abs(cor.r) < 0.6 | cor.p > 0.05] = 0
# cor.r = as.matrix(cor.r)
# g = graph_from_adjacency_matrix(cor.r,mode = "undirected",weighted = TRUE,diag = FALSE)

## Convert the matrices to long format for filtering. Node and link tables
## are needed later to draw the network; this step produces both.
cor.r$node1 <- rownames(cor.r)
cor.p$node1 <- rownames(cor.p)

# One row per ordered (node1, node2) pair
r <- cor.r %>%
  gather(key = "node2", value = "r", -node1) %>%
  data.frame()

p <- cor.p %>%
  gather(key = "node2", value = "p", -node1) %>%
  data.frame()
#head(r)
#head(p)

## Combine the r and p values into one table
cor.data <- merge(r, p, by = c("node1", "node2"))
#head(cor.data)

## Keep pairs with p <= 0.05 and |r| >= 0.6, drop self-links,
## and add the attributes used to draw the edges.
cor.data <- cor.data %>%
  filter(abs(r) >= 0.6, p <= 0.05, node1 != node2) %>%
  mutate(
    linetype = ifelse(r > 0, "positive", "negative"),
    linesize = abs(r)
  )
#head(cor.data)

## 3. Constructing the network graph data structure
# After building the network, it must also be converted to a simple graph
# to remove duplicate links.
### 3.1 Network graph node attribute arrangement
#### Count the number of links each node has
conflicts_prefer(dplyr::summarize)
c(as.character(cor.data$node1), as.character(cor.data$node2)) %>%
  as_tibble() %>%
  group_by(value) %>%
  summarize(n = n()) -> vertices
colnames(vertices) <- c("node", "n")
#head(vertices)

#### Add variable classification attributes
vertices %>%
  select(-n) %>%
  left_join(type, by = "node") -> vertices

#### Nodes are drawn in the order of this table; sort by type so that
#### variables of the same type sit next to each other.
vertices$type <- factor(vertices$type, levels = unique(vertices$type))
vertices <- arrange(vertices, type)
# write.csv() ignores 'col.names' with a warning, so it is not passed here.
write.csv(vertices, "results/Correlation_network_analysis/vertices.csv", quote = FALSE, row.names = FALSE)
head(vertices)
# A tibble: 6 × 2
  node  type                       
  <chr> <fct>                      
1 AK    Soil physi-chemical factors
2 AP    Soil physi-chemical factors
3 NH4   Soil physi-chemical factors
4 NO3   Soil physi-chemical factors
5 OM    Soil physi-chemical factors
6 Wc    Soil physi-chemical factors
### 3.2 Build graph structure data
# Undirected graph: edges from cor.data, node metadata from vertices
g <- graph_from_data_frame(cor.data, vertices = vertices, directed = FALSE )
#g
# Node and link counts (transcript output shown below)
vcount(g) 
[1] 21
ecount(g) 
[1] 42
#get.vertex.attribute(g) # View the node attributes contained in the graph
#get.edge.attribute(g) # View the link properties contained in the graph


## 3.3 Simple Graph
# A non-simple graph (duplicate edges/loops) inflates link counts,
# so it is collapsed to a simple graph below.
is.simple(g) # For non-simple graphs, the number of links will be too high, so it needs to be converted to a simple graph
[1] FALSE
# Uniform edge weight; edge.attr.comb = "first" keeps the first
# edge's attributes when duplicates are merged
E(g)$weight <- 1
g <- igraph::simplify(g,
                      remove.multiple = TRUE,
                      remove.loops = TRUE,
                      edge.attr.comb = "first")
# g <- delete.vertices(g,which(degree(g) == 0)) # Delete isolated points
is.simple(g)
[1] TRUE
E(g)$weight <- 1
is.weighted(g)
[1] TRUE
# Same 21 nodes; duplicate links removed (42 -> 33 edges)
vcount(g)
[1] 21
ecount(g)
[1] 33
### 3.4 Calculating the number of node links
conflicts_prefer(igraph::degree)
V(g)$degree <- degree(g)
#vertex.attributes(g)
#edge.attributes(g) 
#g

### 3.5 Save the graph locally
# Directly save the graph structure. GML can save the most graph information.
write.graph(g, file = "results/Correlation_network_analysis/all.gml", format = "gml")
net.data  <- igraph::as_data_frame(g, what = "both")$edges # Extract link properties
# write.csv() ignores 'col.names' with a warning, so it is not passed here.
write.csv(net.data, "results/Correlation_network_analysis/net.data.csv", quote = FALSE, row.names = FALSE)
head(net.data)
  from                       to          r            p linetype  linesize
1   AK                       AP  0.7623377 0.0220369496 positive 0.7623377
2   AK                       OM  0.7324675 0.0001595923 positive 0.7324675
3   AK                       pH -0.6684011 0.0009260728 negative 0.6684011
4   AK Gymnascella hyalinospora -0.6043092 0.0037140827 negative 0.6043092
5   AK  Pseudallescheria boydii  0.7397974 0.0469368082 positive 0.7397974
6   AK        Bacillus subtilis  0.6154758 0.0029781355 positive 0.6154758
  weight
1      1
2      1
3      1
4      1
5      1
6      1
vertices  <- igraph::as_data_frame(g, what = "both")$vertices # Extracting node attributes
# write.csv() ignores 'col.names' with a warning, so it is not passed here.
write.csv(vertices, "results/Correlation_network_analysis/vertices.csv", quote = FALSE, row.names = FALSE)
# head(vertices) 
# Directly read previously saved link and node attribute files, and then directly generate graphs or use them for drawing in other drawing software.

## 4. Draw a group network diagram
### 4.1 Preparing network diagram layout data
# NOTE(review): layout_with_fr/layout_nicely/layout_with_graphopt are
# stochastic — call set.seed() first if reproducible coordinates are needed.
#?layout_in_circle
layout1 <- layout_in_circle(g) 
layout2 <- layout_with_fr(g)
layout3 <- layout_on_grid(g)
layout4 <- layout_nicely(g)
layout5 <- layout_with_graphopt(g)
#head(layout1)

### 4.2 Setting the drawing color
#?rgb()

### Set the background color of nodes and groups
# color <- c(rgb(65,179,194,maxColorValue = 255),
#          rgb(255,255,0,maxColorValue = 255),
#          rgb(201,216,197,maxColorValue = 255))
# One colour per node type (three types expected in V(g)$type)
color <- c("#5ebcc2","#ce77ad","#879b56")
names(color) <- unique(V(g)$type) # Name the color based on the node classification attribute
V(g)$point.col <- color[match(V(g)$type,names(color))] # Set node color
# names(color2) <- unique(V(g)$type) 
# If you want the node color to be different from the background color, you can set a separate color set for the node.
# V(g)$point.col <- color2[match(V(g)$type,names(color2))]

#### The edge color is set according to the positive or negative correlation
# E(g)$color <- ifelse(E(g)$linetype == "positive",rgb(255,215,0,maxColorValue = 255),"gray50")
E(g)$color <- ifelse(E(g)$linetype == "positive","red",rgb(0,147,0,maxColorValue = 255))
# E(g)$color = ifelse(E(g)$r>0,rgb(254/255,67/255,101/255,abs(E(g)$r)),rgb(0/255,0/255,255/255,abs(E(g)$r)))

### 4.3 Draw a radial layout network diagram
# Fixes vs. original: 'shape', 'vertex.border' and 'vertex.label.col' are not
# plot.igraph parameters (the label-colour parameter is vertex.label.color);
# 'g$name' is a graph attribute (normally NULL) — node names live in V(g)$name.
# Trailing empty arguments in the legend() calls were also removed.
pdf("results/Correlation_network_analysis/network_group_circle.pdf",family = "Times",width = 10,height = 12)
par(mar = c(5, 2, 1, 2))
plot.igraph(g, layout = layout1,
     ## Node colour parameters
     vertex.color = V(g)$point.col,
     vertex.frame.color = "black",
     ## Node size parameters (scaled by degree)
     vertex.shape = "circle",
     vertex.size = V(g)$degree * 2,
     rescale = TRUE,
     ## Node label parameters
     vertex.label = V(g)$name,
     vertex.label.cex = 0.8,
     ## Distance between the label and the node centre; 0 = at the centre
     vertex.label.dist = 0,
     ## Label colour (correct parameter name: vertex.label.color)
     vertex.label.color = "black",
     ## Node group lists used to draw the group background hulls
     mark.groups = list(V(g)$name[V(g)$type %in% names(color)[1]],
                        V(g)$name[V(g)$type %in% names(color)[2]],
                        V(g)$name[V(g)$type %in% names(color)[3]]),
     mark.col = color,
     mark.border = color,
     ## Link attribute parameters start with edge*
     edge.arrow.size = 0.5,
     edge.width = abs(E(g)$r) * 2,
     edge.curved = TRUE)
## Legends — positions need adjusting for your own data
legend(
  title = "Type",
  list(x = min(layout1[, 1]) - 0.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(unique(V(g)$type)),
  fill = color
)

legend(
  title = "|r-value|",
  list(x = min(layout1[, 1]) + 0.4,
       y = min(layout1[, 2]) - 0.17),
  legend = c(0.6, 0.8, 1.0),
  col = "black",
  lty = 1,
  lwd = c(0.6, 0.8, 1.0) * 2
)

legend(
  title = "Correlation (±)",
  list(x = min(layout1[, 1]) + 0.8,
       y = min(layout1[, 2]) - 0.17),
  legend = c("positive","negative"),
  col = c("red", rgb(0, 147, 0, maxColorValue = 255)),
  lty = 1,
  lwd = 1
)

legend(
  title = "Degree",
  list(x = min(layout1[, 1]) + 1.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(1, seq(0, 8, 2)[-1]),  # degree breaks; cf. V(g)$degree
  col = "black",
  pch = 1,
  pt.lwd = 1,
  pt.cex = c(1, seq(0, 8, 2)[-1])
)
dev.off()
png 
  2 
### 4.4 Draw fr layout network diagram - no background color
# Fixes vs. original: invalid plot.igraph parameters removed/corrected
# ('vertex.border' dropped; 'vertex.label.col' -> vertex.label.color;
# 'g$name' -> V(g)$name) and trailing empty legend() arguments removed.
pdf("results/Correlation_network_analysis/network_group_fr.pdf",family = "Times",width = 10,height = 12)
par(mar = c(5, 2, 1, 2))
plot.igraph(g, layout = layout2,
     vertex.color = V(g)$point.col,
     vertex.frame.color = "black",
     vertex.size = V(g)$degree * 2,
     vertex.label = V(g)$name,
     vertex.label.cex = 0.8,
     vertex.label.color = "black",
     edge.arrow.size = 0.5,
     edge.width = abs(E(g)$r) * 2,
     edge.curved = TRUE)
## Legends (positions reuse layout1's bounding box, as in the circle plot)
legend(
  title = "Type",
  list(x = min(layout1[, 1]) - 0.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(unique(V(g)$type)),
  fill = color
)
legend(
  title = "|r-value|",
  list(x = min(layout1[, 1]) + 0.4,
       y = min(layout1[, 2]) - 0.17),
  legend = c(0.6, 0.8, 1.0),
  col = "black",
  lty = 1,
  lwd = c(0.6, 0.8, 1.0) * 2
)
legend(
  title = "Correlation (±)",
  list(x = min(layout1[, 1]) + 0.8,
       y = min(layout1[, 2]) - 0.17),
  legend = c("positive","negative"),
  col = c("red", rgb(0, 147, 0, maxColorValue = 255)),
  lty = 1,
  lwd = 1
)
legend(
  title = "Degree",
  list(x = min(layout1[, 1]) + 1.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(1, seq(0, 8, 2)[-1]),  # degree breaks; cf. max(V(g)$degree)
  col = "black",
  pch = 1,
  pt.lwd = 1,
  pt.cex = c(1, seq(0, 8, 2)[-1])
)
dev.off()
png 
  2 
### 4.5 Draw a graphopt layout network diagram - without adding background color
# Fixes vs. original: invalid plot.igraph parameters removed/corrected
# ('vertex.border' dropped; 'vertex.label.col' -> vertex.label.color;
# 'g$name' -> V(g)$name), duplicated vertex.frame.color collapsed to one,
# and trailing commas in the calls removed.
pdf("results/Correlation_network_analysis/network_group_graphopt.pdf",family = "Times",width = 10,height = 12)
par(mar = c(5, 2, 1, 2))
plot.igraph(g, layout = layout5,
     vertex.color = V(g)$point.col,
     #vertex.size = V(g)$degree*2,
     vertex.size = 6,                  # fixed node size
     vertex.frame.color = "white",
     vertex.label = V(g)$name,
     vertex.label.cex = 0.8,
     vertex.label.dist = 0,
     vertex.label.degree = pi/2,       # labels below the nodes
     vertex.label.color = "black",
     edge.arrow.size = 0.5,
     edge.width = abs(E(g)$r) * 6)
     #edge.curved = TRUE

legend(
  title = "Type",
  list(x = min(layout1[, 1]) - 0.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(unique(V(g)$type)),
  fill = color
)
legend(
  title = "|r-value|",
  list(x = min(layout1[, 1]) + 0.4,
       y = min(layout1[, 2]) - 0.17),
  legend = c(0.6, 0.8, 1.0),
  col = "black",
  lty = 1,
  lwd = c(0.6, 0.8, 1.0) * 2
)
legend(
  title = "Correlation (±)",
  list(x = min(layout1[, 1]) + 0.8,
       y = min(layout1[, 2]) - 0.17),
  legend = c("positive","negative"),
  col = c("red", rgb(0, 147, 0, maxColorValue = 255)),
  lty = 1,
  lwd = 1
)
legend(
  title = "Degree",
  list(x = min(layout1[, 1]) + 1.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(1, seq(0, 8, 2)[-1]),  # degree breaks; cf. max(V(g)$degree)
  col = "black",
  pch = 1,
  pt.lwd = 1,
  pt.cex = c(1, seq(0, 8, 2)[-1])
)
dev.off()
png 
  2 
### 4.6 Set the edge gradient color to indicate the strength of the correlation. The stronger the correlation, the darker the color.
#?layout_in_circle
# Pre-compute candidate layouts. layout5 (graphopt) is used for plotting;
# layout1 (circle) is only used to anchor the legend positions.
layout1 <- layout_in_circle(g) 
layout2 <- layout_with_fr(g) 
layout3 <- layout_on_grid(g)
layout4 <- layout_nicely(g)
layout5 <- layout_with_graphopt(g)
#head(layout1)

#?rgb() 。
# One fill colour per feature type; matched to vertices by type name below.
color <- c("#5ebcc2","#ce77ad","#879b56")

names(color) <- unique(V(g)$type)
V(g)$point.col <- color[match(V(g)$type,names(color))] 
#names(color2) <- unique(V(g)$type) 
#V(g)$point.col <- color2[match(V(g)$type,names(color2))]

#E(g)$color <- ifelse(E(g)$linetype == "positive",rgb(255,215,0,maxColorValue = 255),"gray50")
#E(g)$color <- ifelse(E(g)$linetype == "positive","red",rgb(0,147,0,maxColorValue = 255))
# Red for positive, blue for negative edges; the alpha channel is |r|, so
# stronger correlations are drawn more opaque (the gradient of this section).
E(g)$color = ifelse(E(g)$r>0,rgb(254/255,67/255,101/255,abs(E(g)$r)),rgb(0/255,0/255,255/255,abs(E(g)$r)))

# Same graphopt network as 4.5, now using the gradient edge colours set above.
pdf("results/Correlation_network_analysis/network_group_graphopt3.pdf",family = "Times",width = 10,height = 12)
par(mar=c(5,2,1,2))
plot.igraph(g, layout=layout5,
     vertex.color=V(g)$point.col,
     #vertex.frame.color ="black",
     vertex.border=V(g)$point.col,
     #vertex.size=V(g)$degree*2,
     vertex.size=6,
     vertex.frame.color="white",
     # NOTE(review): g$name is a graph-level attribute; per-vertex labels are
     # usually V(g)$name -- confirm.
     vertex.label=g$name,
     vertex.label.cex=0.8,
     vertex.label.dist=0, 
     vertex.label.degree = pi/2, 
     vertex.label.col="black",
     #vertex.frame.color="transparent",
     edge.arrow.size=0.5,
     edge.width=abs(E(g)$r)*6,
     #edge.curved = TRUE,
     #edge.label.x = # 
     #edge.label.y = # 
     )

# Only the Type legend is drawn here; the correlation colour scale is written
# separately as a stand-alone colour-bar PDF in the next step.
legend(
  title = "Type",
  list(x = min(layout1[,1])-0.2,
       y = min(layout1[,2])-0.17),
  legend = c(unique(V(g)$type)),
  fill = color,
  #pch=1
)
dev.off()
png 
  2 
# Stand-alone colour bar for the edge gradient: red at r = 1 fading to
# transparent at r = 0, then blue deepening to r = -1 (201 alpha slices).
pdf("results/Correlation_network_analysis/network_group_graphopt3_legend.pdf",family = "Times",width = 10,height = 3)
color_legend = c(rgb(254/255,67/255,101/255,seq(1,0,by=-0.01)),rgb(0/255,0/255,255/255,seq(0,1,by=0.01)))
par(mar=c(2,2,1,2),xpd = T,cex.axis=1.6,las=1)
# One unit-height bar per colour slice, no gaps, no axes -> continuous ramp.
barplot(rep(1,length(color_legend)),border = NA, space = 0,ylab="",xlab="",xlim=c(1,length(color_legend)),horiz=FALSE,
        axes = F, col=color_legend,main="")
# Top axis labels the ramp with the correlation value from 1 down to -1.
axis(3,at=seq(1,length(color_legend),length=5),c(1,0.5,0,-0.5,-1),tick=FALSE)
dev.off()
png 
  2 

Network stability analysis — network analysis and comparison with ggClusterNet (利用ggClusterNet完成网络分析与对比)

Reference: https://mp.weixin.qq.com/s/qJHibDjtbEqpqgsfrYdETw

# install.packages("BiocManager")
# library(BiocManager)
# install("remotes")
# install("tidyverse")
library(tidyverse)
# install("tidyfst")
library(tidyfst)
# install("igraph")
library(igraph)
# install("sna")
library(sna)
# install("phyloseq")
library(phyloseq)
# install("ggalluvial")
library(ggalluvial)
# install("ggraph")
library(ggraph)
# install("WGCNA")
library(WGCNA)
# install("ggnewscale")
library(ggnewscale)
# install("pulsar")
library(pulsar)
# install("patchwork")
library(patchwork)
# remotes::install_github("taowenmicro/EasyStat")
library(EasyStat)
# remotes::install_github("taowenmicro/ggClusterNet")

library(ggClusterNet)
library(phyloseq)
library(tidyverse)
library(igraph)
library(tidyfst)

#netpath = paste(otupath,"/network_stab/",sep = "")
#dir.create(netpath)

# Reading the raw file(原始文件读取)
# Sample metadata, feature table and taxonomy; SampleID becomes the rownames
# so phyloseq can align samples across the tables.
metadata = read.delim("data/data.ps/metadata.tsv")
row.names(metadata) = metadata$SampleID
otutab = read.table("data/data.ps/otutab.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
taxonomy = read.table("data/data.ps/taxonomy.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
library(ggtree)
# Phylogenetic tree and representative sequences for the same ASVs.
tree = read.tree("data/data.ps/otus.tree")
library(Biostrings)
rep = readDNAStringSet("data/data.ps/otus.fa")
# Import phyloseq package(导入phyloseq(ps)R包)
library(phyloseq)
# Assemble all five components into a single phyloseq object.
# taxa_are_rows = TRUE because otutab has ASVs as rows, samples as columns.
ps = phyloseq(sample_data(metadata),
              otu_table(as.matrix(otutab), taxa_are_rows=TRUE), 
              tax_table(as.matrix(taxonomy)),
              phy_tree(tree),
              refseq(rep)
              )
ps
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 2432 taxa and 18 samples ]
sample_data() Sample Data:       [ 18 samples by 11 sample variables ]
tax_table()   Taxonomy Table:    [ 2432 taxa by 7 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 2432 tips and 2431 internal nodes ]
refseq()      DNAStringSet:      [ 2432 reference sequences ]
# Main function, used for network calculation and visualization
library(ggClusterNet)
library(phyloseq)
library(tidyverse)
# NOTE(review): the original `data(ps)` call here silently overwrote the
# phyloseq object assembled above from the user's own metadata/otutab/
# taxonomy with the ggClusterNet demo dataset. Removed so the analysis runs
# on the data read in this script; restore it only if the demo data is
# actually wanted.
# data(ps)
otupath = "./"
# Set work directory: create the output folder (and missing parents) without
# warning if it already exists.
netpath = paste0(otupath, "results/Correlation_network_analysis/network.new/")
dir.create(netpath, recursive = TRUE, showWarnings = FALSE)

# Confirm the taxonomy ranks available for the `fill = "Phylum"` option below.
rank.names(ps)
[1] "Kingdom" "Phylum"  "Class"   "Order"   "Family"  "Genus"   "Species"
library(ggrepel)
library(igraph)
#detach("package:MicrobiotaProcess")

# Network analysis main function
# One call builds a correlation network per level of "Group" on the 400 most
# abundant ASVs (spearman, |r| > 0.6, p < 0.05), lays the networks out,
# computes Zi-Pi roles, and compares each network with random graphs.
tab.r = network.pip(
  ps = ps,
  N = 400,
  # ra = 0.05,
  big = TRUE,
  select_layout = FALSE,
  layout_net = "model_maptree2",
  r.threshold = 0.6,
  p.threshold = 0.05,
  maxnode = 2,
  method = "spearman",
  label = TRUE,
  lab = "elements",
  group = "Group",
  fill = "Phylum",
  size = "igraph.degree",
  zipi = TRUE,
  ram.net = TRUE,
  clu_method = "cluster_fast_greedy",
  step = 100,
  R=10,
  ncpus = 1
)
[1] "KO"
[1] "cor matrix culculating over"
[1] "OE"
[1] "cor matrix culculating over"
[1] "WT"
[1] "cor matrix culculating over"
# It is recommended to save the output results as R objects to save time by not performing correlation matrix calculations later.
saveRDS(tab.r,paste0(netpath,"network.pip.sparcc.rds"))
tab.r = readRDS(paste0(netpath,"network.pip.sparcc.rds"))
# tab.r[[2]] holds the data objects; the per-group correlation matrices live
# under net.cor.matrix$cortab (a named list: one matrix per group).
dat = tab.r[[2]]
cortab = dat$net.cor.matrix$cortab

# It is not easy to run a large correlation matrix, so it is recommended to save it to facilitate the calculation of various network properties.
saveRDS(cortab,paste0(netpath,"cor.matrix.all.group.rds"))
# NOTE(review): `cor` shadows base::cor() from here on; downstream code uses
# it only as this list of correlation matrices.
cor = readRDS(paste0(netpath,"cor.matrix.all.group.rds"))

# Extract the storage objects of all images
plot = tab.r[[1]]
# Extract network graph visualization results
p0 = plot[[1]]

ggsave(paste0(netpath,"plot.network.pdf"),p0,width = 12,height = 5)
ggsave(paste0(netpath,"plot.network2.pdf"),p0,width = 16,height = 10)

# zipi display
plot[[2]]

# Comparison with random networks
plot[[3]]

## Network attribute calculation - rich network attributes
# For each group, rebuild the igraph from its stored correlation matrix,
# compute whole-network properties, and bind the per-group columns into one
# table. (Replaces the original grow-by-cbind loop with its stray `i = 1`,
# `1:length(id)` footgun, and a discarded `head()` call.)
id = names(cor)
prop_list = lapply(id, function(grp) {
  g.tmp = make_igraph(cor[[grp]])
  dat = net_properties.4(g.tmp, n.hub = FALSE)
  colnames(dat) = grp          # label the single column with the group name
  dat
})
# One column per group (KO/OE/WT); rows are the network attributes.
dat2 = do.call(cbind, prop_list)
head(dat2)
                          KO                    OE                   
num.edges(L)              "348"                 "261"                
num.pos.edges             "204"                 "174"                
num.neg.edges             "144"                 "87"                 
num.vertices(n)           "279"                 "273"                
Connectance(edge_density) "0.00897346638817978" "0.00702973497091144"
average.degree(Average K) "2.49462365591398"    "1.91208791208791"   
                          WT                   
num.edges(L)              "258"                
num.pos.edges             "151"                
num.neg.edges             "107"                
num.vertices(n)           "278"                
Connectance(edge_density) "0.00670077656286523"
average.degree(Average K) "1.85611510791367"   
# Persist the per-group network attribute table (one column per group).
FileName <- paste0(netpath, "net.network.attribute.data.csv")
write.csv(dat2, FileName, quote = FALSE)

## Node attribute calculation
# Per group: compute node-level properties (degree, closeness, betweenness,
# centrality), suffix every column with the group name, and full-join the
# groups on the ASV name so nodes absent from a group get NA.
# (Replaces the 1:length() loop that grew nodepro2 by repeated joins and
# discarded a head() result; also avoids naming a variable `igraph`, which
# shadowed the package name.)
node_list = lapply(id, function(grp) {
  g.tmp = make_igraph(cor[[grp]])
  nodepro = node_properties(g.tmp) %>% as.data.frame()
  nodepro$Group = grp
  # Suffix all columns (including Group) so per-group columns do not collide
  # in the join; this reproduces the original Group.KO / Group.OE / ... names.
  colnames(nodepro) = paste0(colnames(nodepro), ".", grp)
  nodepro %>%
    as.data.frame() %>%
    rownames_to_column("ASV.name")
})
# Outer-join all groups on ASV.name.
nodepro2 = Reduce(function(a, b) full_join(a, b, by = "ASV.name"), node_list)
head(nodepro2)
  ASV.name igraph.degree.KO igraph.closeness.KO igraph.betweenness.KO
1  ASV_180                2                   1                     0
2  ASV_378                2                   1                     0
3   ASV_20                4                   1                     0
4  ASV_193                4                   1                     0
5  ASV_309                4                   1                     0
6  ASV_417                4                   1                     0
  igraph.cen.degree.KO Group.KO igraph.degree.OE igraph.closeness.OE
1                    2       KO                3                   1
2                    2       KO               NA                  NA
3                    4       KO               NA                  NA
4                    4       KO               NA                  NA
5                    4       KO               NA                  NA
6                    4       KO                1                   1
  igraph.betweenness.OE igraph.cen.degree.OE Group.OE igraph.degree.WT
1                     0                    3       OE                1
2                    NA                   NA     <NA>               NA
3                    NA                   NA     <NA>                4
4                    NA                   NA     <NA>                2
5                    NA                   NA     <NA>                2
6                     0                    1       OE               NA
  igraph.closeness.WT igraph.betweenness.WT igraph.cen.degree.WT Group.WT
1                   1                     0                    1       WT
2                  NA                    NA                   NA     <NA>
3                   1                     0                    4       WT
4                   1                     0                    2       WT
5                   1                     0                    2       WT
6                  NA                    NA                   NA     <NA>
# Persist the joined per-node attribute table.
FileName <- paste0(netpath, "net.node.attribute.data.sample.csv")
write_csv(nodepro2, FileName)

## Customizable network output
# Node and edge tables (with layout coordinates) for re-plotting the network
# manually with ggplot2 below.
dat = tab.r[[2]]
node = dat$net.cor.matrix$node
edge = dat$net.cor.matrix$edge
head(edge)
# A tibble: 6 × 12
     X2    Y2 OTU_2   OTU_1   weight    X1    Y1 cor   group Group nodes label  
  <dbl> <dbl> <chr>   <chr>    <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <fct>  
1 -37.9 -3.41 ASV_126 ASV_431     -1 -35.8 -4.83 -     WT    WT      278 WT: (n…
2 -37.9 -3.41 ASV_126 ASV_330      1 -38.0 -5.34 +     WT    WT      278 WT: (n…
3 -37.9 -3.41 ASV_126 ASV_380      1 -36.3 -2.93 +     WT    WT      278 WT: (n…
4 -35.8 -4.83 ASV_431 ASV_330     -1 -38.0 -5.34 -     WT    WT      278 WT: (n…
5 -35.8 -4.83 ASV_431 ASV_380     -1 -36.3 -2.93 -     WT    WT      278 WT: (n…
6 -38.0 -5.34 ASV_330 ASV_380      1 -36.3 -2.93 +     WT    WT      278 WT: (n…
head(node)
          X1         X2 elements igraph.degree igraph.closeness
1  10.907726  -9.455050    ASV_1             4                1
2   6.602809 -18.194949   ASV_10             2                1
3 -28.228285   5.342999  ASV_100             0                0
4   3.737447 -29.250730  ASV_101             1                1
5 -27.083427   3.213331  ASV_102             1                1
6 -23.166028   5.072746  ASV_103             1                1
  igraph.betweenness igraph.cen.degree group      ID  Kingdom         Phylum
1                  0                 4    KO   ASV_1 Bacteria Actinobacteria
2                  0                 2    KO  ASV_10 Bacteria Proteobacteria
3                  0                 0    KO ASV_100 Bacteria Proteobacteria
4                  0                 1    KO ASV_101 Bacteria Proteobacteria
5                  0                 1    KO ASV_102 Bacteria Proteobacteria
6                  0                 1    KO ASV_103 Bacteria Proteobacteria
                Class           Order              Family           Genus
1      Actinobacteria Actinomycetales Thermomonosporaceae      Unassigned
2 Alphaproteobacteria     Rhizobiales        Rhizobiaceae       Rhizobium
3  Betaproteobacteria Burkholderiales      Comamonadaceae  Hydrogenophaga
4          Unassigned      Unassigned          Unassigned      Unassigned
5 Gammaproteobacteria      Unassigned          Unassigned      Unassigned
6 Alphaproteobacteria     Rhizobiales  Phyllobacteriaceae Phyllobacterium
                       Species Group nodes                     label
1                   Unassigned    KO   279 KO: (nodes: 279; links: )
2                   Unassigned    KO   279 KO: (nodes: 279; links: )
3    Hydrogenophaga_intermedia    KO   279 KO: (nodes: 279; links: )
4                   Unassigned    KO   279 KO: (nodes: 279; links: )
5                   Unassigned    KO   279 KO: (nodes: 279; links: )
6 Phyllobacterium_bourgognense    KO   279 KO: (nodes: 279; links: )
#node2  = add.id.facet(node,"Group")
#head(node2)

# Manual ggplot rendering: edges as faint segments coloured by correlation
# sign ("+" / "-"), nodes as filled points (fill = Phylum, size = degree),
# one facet per group network.
# NOTE(review): nodes use aes(X1, X2) because X1/X2 are the two layout
# coordinates in `node` (see head(node) above); in `edge` the coordinate
# pairs are (X1, Y1) -> (X2, Y2). Also, ggplot2 >= 3.4 prefers `linewidth`
# over `size` for segments -- confirm the installed version before changing.
p <- ggplot() + geom_segment(aes(x = X1, y = Y1, xend = X2, yend = Y2,color = cor),
                             data = edge, size = 0.03,alpha = 0.1) +
  geom_point(aes(X1, X2,
                 fill = Phylum,
                 size = igraph.degree),
             pch = 21, data = node,color = "gray40") +
  facet_wrap(.~ label,scales="free_y",nrow = 1) +
  # geom_text_repel(aes(X1, X2,label = elements),pch = 21, data = nodeG) +
  # geom_text(aes(X1, X2,label = elements),pch = 21, data = nodeG) +
  scale_colour_manual(values = c("#6D98B5","#D48852")) +
  # scale_fill_hue()+
  scale_size(range = c(0.8, 5)) +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(breaks = NULL) +
  theme(panel.background = element_blank(),
        plot.title = element_text(hjust = 0.5)
  ) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank()
  ) +
  theme(legend.background = element_rect(colour = NA)) +
  theme(panel.background = element_rect(fill = "white",  colour = NA)) +
  theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank())
p

# Zipi Visualization-Customization
dat.z = dat$zipi.data
head(dat.z)
           z module p       roles label           role_7 group      id
KO.ASV_180 0     42 0 Peripherals       ultra peripheral    KO ASV_180
KO.ASV_378 0     42 0 Peripherals       ultra peripheral    KO ASV_378
KO.ASV_2   0     42 0 Peripherals       ultra peripheral    KO   ASV_2
KO.ASV_20  0     10 0 Peripherals       ultra peripheral    KO  ASV_20
KO.ASV_193 0     10 0 Peripherals       ultra peripheral    KO ASV_193
KO.ASV_309 0     10 0 Peripherals       ultra peripheral    KO ASV_309
# Background rectangles for the four topological-role regions of the Zi-Pi
# plane (x = participation coefficient P, y = within-module connectivity z;
# boundaries at P = 0.62 and z = 2.5).
x1 <- c(0, 0.62, 0, 0.62)
x2 <- c(0.62, 1, 0.62, 1)
y1 <- c(-Inf, 2.5, 2.5, -Inf)
y2 <- c(2.5, Inf, Inf, 2.5)
lab <- c("peripheral", 'Network hubs', 'Module hubs', 'Connectors')
roles.colors <- c("#E6E6FA","#DCDCDC","#F5FFFA", "#FAEBD7")
tab = data.frame(x1 = x1, y1 = y1, x2 = x2, y2 = y2, lab = lab)
# Repeat the rectangle set once per group so geom_rect has rows for every
# facet. The original `for (i in 1:tem)` grew tab2 via rbind and, because
# 1:0 == c(1, 0), would even run twice when tem == 0; rep() + rbind is
# direct and safe for any count.
tem = length(unique(dat.z$group))
tab2 = do.call(rbind, rep(list(tab), tem))

# Zi-Pi scatter: role-region background rectangles, one point per node
# (coloured by module), labels via ggrepel, one facet per group.
p <- ggplot() +
  geom_rect(data=tab2,
            mapping=aes(xmin=x1,
                        xmax=x2,
                        ymin=y1,
                        ymax=y2,
                        fill = lab))+
  guides(fill=guide_legend(title="Topological roles")) +
  scale_fill_manual(values = roles.colors)+
  geom_point(data=dat.z,aes(x=p, y=z,color=module)) + theme_bw()+
  # `guides(color = F)` is deprecated since ggplot2 3.3.4; "none" is the
  # supported way to suppress the module-colour legend.
  guides(color = "none") +
  ggrepel::geom_text_repel(data = dat.z,
                           aes(x = p, y = z,
                               color = module,label=label),size=4)+
  # facet_wrap(.~group) +
  # Spell out `scales` (the original relied on partial matching of `scale=`).
  facet_grid(.~ group, scales = 'free') +
  theme(strip.background = element_rect(fill = "white"))+
  xlab("Participation Coefficient")+ylab(" Within-module connectivity z-score")
p

# Random networks, power-law distribution
# Compare each empirical network with its random counterparts: points per
# network, smoothed trend per group, one panel per metric in column `g`.
dat.r = dat$random.net.data
p3 <- ggplot(dat.r) +
  geom_point(aes(x = ID,y = network,
                 group =group,fill = group),pch = 21,size = 2) +
  geom_smooth(aes(x = ID,y = network,group =group,color = group))+
  facet_grid(.~g,scales = "free") +
  theme_bw() + theme(
    plot.margin=unit(c(0,0,0,0), "cm")
  )
p3

# Multi-network comparison-network saliency
# Pairwise significance test of edge overlap between group networks rebuilt
# from the saved correlation matrices (Both = shared edges; P1A2/P2A1 =
# present in one, absent in the other).
dat = module.compare.net.pip(
  ps = NULL,
  corg = cor,
  degree = TRUE,
  zipi = FALSE,
  r.threshold= 0.8,
  p.threshold=0.05,
  method = "spearman",
  padj = F,
  n = 3)
res = dat[[1]]
head(res)
      module1 module2 Both P1A2 P2A1 A1A2     p_raw     p_adj
Var1       KO      OE  165  114  108   59 0.8965031 0.8965031
Var11      KO      WT  160  119  118   49 0.9983120 0.9983120
Var12      OE      WT  174   99  104   69 0.2515650 0.2515650
# Save the pairwise comparison table.
FileName <- paste(netpath,"net.compare.diff.sig.csv", sep = "")
write.csv(res,FileName,quote = F)

## Network stability-module comparison
library(tidyfst)
# Compare module composition across group networks; `zoom` controls the
# layout spread of each group's module circle in the resulting figure.
res1 = module.compare.m(
  ps = NULL,
  corg = cor,
  zipi = FALSE,
  zoom = 0.2,
  padj = F,
  n = 3)
[1] 36  8
[1] 67  8
[1] 86  8
[1] 0
[1] 1
[1] 2
[1] 3
[1] 0
[1] 1
[1] 2
[1] 3
[1] 4
[1] 0
[1] 1
[1] 2
[1] 3
[1] 4
# Different groups are shown using a circle. A dot in the circle represents a
# module. Connected modules represent similar modules.
# BUG FIX: the assignment below had been pasted onto the end of the comment
# line above, so `p1` was never created and both the `p1` display here and
# the ggsave(FileName, p1, ...) further down failed.
p1 = res1[[1]]
p1

# Extract the corresponding information of OTU, grouping, etc. of the module
dat1 = res1[[2]]
head(dat1)
       ID     group Group
1  ASV_62 KOmodel_1    KO
2  ASV_55 KOmodel_1    KO
3 ASV_122 KOmodel_1    KO
4 ASV_224 KOmodel_1    KO
5 ASV_248 KOmodel_1    KO
6 ASV_395 KOmodel_1    KO
# Module similarity result table
dat2 = res1[[3]]
head(dat2)
                        module1    module2 Both P1A2 P2A1 A1A2
KOmodel_32-OEmodel_4 KOmodel_32  OEmodel_4    1    2    4  380
KOmodel_40-OEmodel_4 KOmodel_40  OEmodel_4    1    2    4  380
KOmodel_1-OEmodel_12  KOmodel_1 OEmodel_12    2    5    2  378
KOmodel_19-OEmodel_9 KOmodel_19  OEmodel_9    1    3    3  380
KOmodel_25-OEmodel_8 KOmodel_25  OEmodel_8    1    3    3  380
KOmodel_28-OEmodel_8 KOmodel_28  OEmodel_8    1    2    3  381
                                   p_raw               p_adj
KOmodel_32-OEmodel_4  0.0383590783638981  0.0383590783638981
KOmodel_40-OEmodel_4  0.0383590783638981  0.0383590783638981
KOmodel_1-OEmodel_12 0.00165785308197045 0.00165785308197045
KOmodel_19-OEmodel_9  0.0408633515973255  0.0408633515973255
KOmodel_25-OEmodel_8  0.0408633515973255  0.0408633515973255
KOmodel_28-OEmodel_8  0.0307671763509816  0.0307671763509816
# Derive the group prefix (e.g. "KO" from "KOmodel_32") for both modules of
# each similar pair, and label the comparison as "<g1>_Vs_<g2>".
# vapply() is type-stable (unlike sapply()), and the small helper avoids
# duplicating the strsplit pipeline.
group_prefix = function(x) vapply(strsplit(x, "model"), `[[`, character(1), 1)
dat2$m1 = group_prefix(dat2$module1)
dat2$m2 = group_prefix(dat2$module2)
dat2$cross = paste(dat2$m1, dat2$m2, sep = "_Vs_")
# head(dat2)
# Drop placeholder rows with no matched module.
dat2 = dat2 %>% filter(module1 != "none")

# Bar chart: number of similar modules per group-pair comparison.
p2 = ggplot(dat2) + geom_bar(aes(x = cross,fill = cross)) +
  labs(x = "",
       y = "numbers.of.similar.modules"
  )+ theme_classic()
p2

# It is found that the networks of group 1 and group 3 are more similar
# Save the module-similarity figures and the underlying tables.
FileName <- paste(netpath,"module.compare.groups.pdf", sep = "")
ggsave(FileName, p1, width = 10, height = 10)

FileName <- paste(netpath,"numbers.of.similar.modules.pdf", sep = "")
ggsave(FileName, p2, width = 8, height = 8)

# OTU membership of every module, per group.
FileName <- paste(netpath,"module.otu.csv", sep = "")
write.csv(dat1,FileName, quote = F)

# Pairwise module-similarity table (with p-values).
FileName <- paste(netpath,"module.compare.groups.csv", sep = "")
write.csv(dat2,FileName, quote = F)

## Network stability-robustness
# Robust calculations require species richness, so even if the correlation matrix is calculated, a ps object must be entered
# Network stability - Removing key nodes - Network robustness
library(patchwork)
conflicts_prefer(dplyr::desc)
conflicts_prefer(ggplot2::theme_light)
# Remove highly connected (targeted) nodes and track how the network degrades.
res2= Robustness.Targeted.removal(ps = ps,
                                  corg = cor,
                                  degree = TRUE,
                                  zipi = FALSE
)
p3 = res2[[1]]
p3

# Extracting data
dat4 = res2[[2]]

# BUG FIX: res2 comes from Robustness.Targeted.removal(), but its outputs
# were being written into the "Robustness_Random_removal" folder (and the
# random-removal results into the targeted folder below) — the two result
# sets were swapped on disk.
path = paste(netpath,"/Robustness_Targeted_removal/",sep = "")
fs::dir_create(path)
write.csv(dat4,
          paste(path,"targeted_removal_network.csv",sep = ""))
ggsave(paste(path,"targeted_removal_network.pdf",sep = ""),  p3,width = 8,height = 4)


# N etwork stability - random removal of any proportion of nodes - network robustness
res3 = Robustness.Random.removal(ps = ps,
                                 corg = cortab,
                                 Top = 0
                                 )
p4 = res3[[1]]
p4

# Extracting data
dat5 = res3[[2]]
# head(dat5)
path = paste(netpath,"/Robustness_Targeted_removal/",sep = "")
fs::dir_create(path)
write.csv(dat5,
          paste(path,"Robustness_Targeted_removal_network.csv",sep = ""))
ggsave(paste(path,"Robustness_Targeted_removal_network.pdf",sep = ""),  p4,width = 8,height = 4)


## Network stability-negative correlation ratio
# Network stability - calculate the proportion of negative correlations
res4 = negative.correlation.ratio(ps = ps,
                                  corg = cortab,
                                  # Top = 500,
                                  degree = TRUE,
                                  zipi = FALSE)
p5 = res4[[1]]
p5

dat6 = res4[[2]]
# Negatively correlated ratio data
# head(dat6)
path = paste(netpath,"/Vulnerability/",sep = "")
fs::dir_create(path)
write.csv(dat6,
          # fixed file-name typo: was "Vnegative..." (stray "V" carried over
          # from the Vulnerability folder name)
          paste(path,"negative.correlation.ratio_network.csv",sep = ""))
ggsave(paste(path,"negative.correlation.ratio_network.pdf",sep = ""),  p5,width = 4,height = 4)


## Network stability-community stability
# Network stability - community stability - only used for pair samples
# Build a pairing variable: assumes the 18 samples are ordered as 3 groups x
# 6 replicates so replicate k of every group shares the id "Ak" -- TODO
# confirm the sample order in the metadata before trusting the pairing.
treat = ps %>% sample_data()
treat$pair = paste( "A",c(rep(1:6,3)),sep = "")
# head(treat)
sample_data(ps) = treat
# In general, there is no time gradient. Here, time is set to F, which means that every combination of two communities is compared.
res5 = community.stability( ps = ps,
                            corg = cor,
                            time = FALSE)
p6 = res5[[1]]
p6

dat7 = res5[[2]]

path = paste(netpath,"/community.stability/",sep = "")
fs::dir_create(path)

write.csv(dat7,
          paste(path,"community.stability.data.csv",sep = ""))
# fixed ".." typo in the output file name (was "community.stability..boxplot.pdf")
ggsave(paste(path,"community.stability.boxplot.pdf",sep = ""),  p6,width = 4,height = 4)


# Network stability-network survivability
library("pulsar")
# Natural connectivity as nodes are removed one by one; `end` must stay
# below the node count of the smallest group network.
res6 = natural.con.microp (
  ps = ps,
  corg = cor,
  norm = TRUE,
  end = 150,# Smaller than the number of nodes in the network
  start = 0
)
p7 = res6[[1]]
p7

dat8  = res6[[2]]
# NOTE(review): path already ends with "/", so the paste() calls below
# produce a double slash in the file names -- harmless on most filesystems.
path = paste(netpath,"/Natural_connectivity/",sep = "")
fs::dir_create(path)
write.csv(dat8,
          paste(path,"/Natural_connectivity.csv",sep = ""))
ggsave(paste(path,"/Natural_connectivity.pdf",sep = ""),  p7,width = 5,height = 4)


## Network modularity analysis-module feature vector
# Compute WGCNA-style module eigengenes ("MEs") separately for each group.
id = sample_data(ps)$Group %>% unique()
id
[1] KO OE WT
Levels: KO OE WT
i = 1
# NOTE(review): netpath is re-pointed here to ".//network3_MEs/" (the double
# slash is harmless), so outputs below no longer go to network.new/.
netpath = paste(otupath,"/network3_MEs/",sep = "")
dir.create(netpath)
netpath;library(ggClusterNet)
[1] ".//network3_MEs/"
library(igraph)
# Per group: rebuild a network on the top-500 ASVs, extract each node's
# module assignment, and compute WGCNA module eigengenes from that group's
# OTU table.
for (i in seq_along(id)) {
  ps.1 = phyloseq::subset_samples(
    ps,Group %in% c(id[i])
  )
  result = network.2(ps = ps.1, N = 500,
                     big = TRUE,
                     select_layout = TRUE,
                     layout_net = "model_maptree",
                     r.threshold=0.8,
                     p.threshold=0.05,
                     label = FALSE,
                     path = netpath,
                     zipi = FALSE,
                     ncol = 1,
                     nrow = 1,
                     # method = "sparcc",
                     fill = "Phylum"
  )
  # Module assignment per node (the `group` column is the module id).
  tem <- ggClusterNet::model_maptree(cor =result[[4]],
                                     method = "cluster_fast_greedy",
                                     seed = 12
  )
  node_model = tem[[2]]
  # BUG FIX: the file name did not include the group, so the KO/OE/WT
  # iterations all overwrote the same node_model.csv and only the last
  # group's assignments survived.
  tablename <- paste(netpath,"/",id[i],"_node_model",".csv",sep = "")
  write.csv(node_model,tablename)
  # Top-500 OTU abundance table (samples x ASVs) for this group.
  otu = ps.1 %>% 
    filter_OTU_ps(500) %>%
    vegan_otu() %>%
    as.data.frame()
  # Align module assignments to the OTU column order expected by WGCNA.
  node_model = node_model[match(colnames(otu),node_model$ID),]
  
  MEList = WGCNA::moduleEigengenes(otu, colors = node_model$group)
  MEs = MEList$eigengenes %>% as.data.frame()
  tablename <- paste(netpath,"/",id[i],"node_characteristic_variables",".csv",sep = "")
  write.csv(MEs,tablename)
}
[1] "KO"
[1] "cor matrix culculating over"
[1] "OE"
[1] "cor matrix culculating over"
[1] "WT"
[1] "cor matrix culculating over"
#detach("package:igraph")

Topological relationship network analysis

# TOPOSCORE paper - Analysis script
# Reference: Derosa, L., et al. (2024). "Custom scoring based on ecological topology of gut microbiota associated with cancer immunotherapy outcome." Cell 187(13): 3373-3389 e3316.

# load required libraries and helper functions
# tp_helper.R defines load_clin/load_microbiome/log_msg/load_or_compute and
# the screening, scoring and plotting helpers used throughout this section.
source('data/data_TOPOSCORE/tp_helper.R')

## 1. Discovery analysis set ----
log_msg('####### Discovery analysis set #########')
clin_disc <- load_clin(cohort = 'Disc')
met4_disc <- load_microbiome(clin_disc)

conflicts_prefer(base::order)

### 1.1 CoxPH screen (except Akkermansia) ----
# Per-species Cox proportional-hazards screen on overall survival; cached to
# an .rds so re-runs skip the computation.
res_surv <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/res_surv2.rds',
                          screen_surv_met4(clin_disc, met4_disc, type = 'OS'))
plot_surv_forest(res_surv, alpha = 0.05)

# select species based on average HR:
# Keep species with a clear effect in either direction (HR <= 0.8 or >= 1.25).
res_surv_filt <- res_surv %>% dplyr::filter(HR <= 0.8 | HR >= 1.25)
selected_species <- res_surv_filt$SPECIES
log_msg('%d/%d species selected', length(selected_species), nrow(res_surv))
plot_surv_forest(res_surv_filt %>% dplyr::arrange(HR))

# Restrict the abundance table to the selected species, and tag each species
# R (HR < 1, favourable) or NR for later annotation of the heatmap.
met4_disc_filt <- met4_disc[, c('Sample_id', selected_species)]
hr_annots <- res_surv_filt %>% mutate(HRCAT = ifelse(HR < 1, 'R', 'NR')) %>% 
  dplyr::select(HRCAT, SPECIES)


### 1.2 Correlation screen ----
# Pairwise co-occurrence screen (Fisher tests) among the selected species.
res <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/res_pairs2.rds', screen_pairs(met4_disc_filt))
# filter based on Fisher Bonferroni-corrected p-values <= 0.05
res_filt <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/res_pairs_filt2.rds', {
  # Smallest Fisher p per species across all pairs it participates in...
  min_p <- bind_rows(list(
    res %>% dplyr::select(VAR = VAR1, FISHER_P),
    res %>% dplyr::select(VAR = VAR2, FISHER_P)
  )) %>% group_by(VAR) %>% summarize(MIN_P = min(FISHER_P)) %>% arrange(-MIN_P)
  # ...keep species passing the Bonferroni threshold, then only the pairs
  # where both members pass.
  sp2 <- min_p[min_p$MIN_P <= 0.05 / nrow(min_p), 'VAR', drop = TRUE]
  res %>% filter(VAR1 %in% sp2 & VAR2 %in% sp2)
})
log_msg('Keeping %d pairs (%d species)', nrow(res_filt), 
        length(unique(c(res_filt$VAR1, res_filt$VAR2))))


### 1.3 Clustering ----
# Hierarchically cluster the species co-occurrence matrix into k = 7 clusters
# (Ward.D2 on Manhattan distances of the Fisher-p scores), then plot the
# annotated score matrix.
SCORE <- 'fisher_p'
METHOD <- 'ward.D2'
DISTANCE <- 'manhattan'
cc <- cluster_species(res_filt, score = SCORE, method = METHOD, distance = DISTANCE, k = 7) %>% renumber_clusters()
plt_fisher_disc <- plot_score_matrix(res_filt, score = SCORE, method = METHOD, 
                              distance = DISTANCE, annots = list(cc, hr_annots), fontsize = 3)

ggsave(plt_fisher_disc, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_fisher_disc.pdf', width = 30, height = 20, units = "cm")


### 1.4 Definition of SIGB groups ----
# Signature B1 = species in clusters C5+C6; B2 = clusters C1+C2+C3 (cached).
cc_names <- unique(cc$CLUSTER)
cc_species <- setNames(lapply(cc_names, function(x) cc[cc$CLUSTER == x, 'SPECIES', drop = TRUE]), cc_names)
SIGB1 <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/sigb12.rds', c(cc_species$C5, cc_species$C6))
SIGB2 <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/sigb22.rds', c(cc_species$C1, cc_species$C2, cc_species$C3))


### 1.5 Toposcoring ----
# Score every sample from the two signatures, then choose decision thresholds
# on the ROC curve of 12-month OS using Youden's J.
scores_disc <- compute_toposcore(met4_disc, sigb1 = SIGB1, sigb2 = SIGB2)
pred_disc <- clin_disc %>% left_join(scores_disc, by = 'Sample_id') %>% 
  filter(OS12 != '') %>% mutate(OS12 = factor(OS12, levels = c('NR', 'R')))
roc <- calc_roc(pred_disc$OS12, pred_disc$TOPOB01, verbose = TRUE)
log_msg('ROC AUC = %.2f [%.2f - %.2f]', roc$AUC[1], roc$AUC[2], roc$AUC[3])
youden <- roc$ROC_DF %>% mutate(SENS = TPR, SPEC = 1 - FPR) %>% mutate(J = SENS + SPEC - 1)
ggplot(youden, aes(x = THRESHOLD, y = J)) + geom_point()

# Lower cut-off: J-maximising threshold; upper cut-off: a second candidate
# (> 0.7 with J > 0.23) bounding the "R" zone of the score.
ycut_nr <- youden[which.max(youden$J), ] # 0.5351351
ycut_r <- youden[which(youden$THRESHOLD > 0.7 & youden$J > 0.23), ] # 0.7911411
log_msg('Cut-off thresholds = %.4f and %.4f', ycut_nr$THRESHOLD, ycut_r$THRESHOLD)
plt_roc <- plot_roc(roc$ROC_DF) + 
  geom_point(data = ycut_nr, color = 'red') +
  geom_point(data = ycut_r, color = 'green')
ggsave(plt_roc, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_roc.pdf', width = 10, height = 10, units = "cm")
plt_kde_disc <- plot_toposcoreb01_density(scores_disc, clin_disc, 
                                lims = c(ycut_r$THRESHOLD, ycut_nr$THRESHOLD))
ggsave(plt_kde_disc, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_kde_disc.pdf', width = 10, height = 10, units = "cm")


### 1.6 Prediction in discovery cohort (full signature) ----
# Assign R / NR / gray-zone predictions from the two thresholds and report
# the hazard ratio for OS in the discovery cohort.
pred_disc <- assign_prediction(pred_disc, ycut_r$THRESHOLD, ycut_nr$THRESHOLD)
hr_disc <- get_hr(pred_disc, type = 'OS', by = 'PRED')
log_msg('Prediction discovery: HR = %.2f [%.2f-%.2f], p = %.1e', hr_disc[1], hr_disc[2], hr_disc[3], hr_disc[4])
fig_km_disc <- print_plot(plot_mykm(pred_disc, type = 'OS', by = 'PRED'))

ggsave(fig_km_disc, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_km_disc.pdf', width = 10, height = 10, units = "cm")

library("survival")
library("survminer")
# Kaplan-Meier curves of OS stratified by the TOPOSCORE prediction (PRED).
km_fit <- survfit(Surv(OS, Death) ~ PRED, data = pred_disc)
# Plotting the Kaplan-Meier curve
ggsurvplot(km_fit, data = pred_disc, pval = TRUE, conf.int = TRUE,
           xlab = "Time", ylab = "Survival Probability",
           palette = c("#E7B800", "#2E9FDF"))

# Richer version with risk table and median-survival markers, saved to PDF.
pdf(file="results/Correlation_network_analysis/TOPOSCORE/fig_km_disc2.pdf",width = 10,height = 7)
km_fit <- survfit(Surv(OS, Death) ~ PRED, data = pred_disc)
# NOTE(review): when run non-interactively (Rscript/source), a bare
# ggsurvplot() call may not auto-print into the open pdf device; wrap in
# print(...) if the resulting PDF comes out empty -- confirm.
ggsurvplot(km_fit, data = pred_disc,
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE, # Add risk table
           risk.table.col = "strata", # Change risk table color by groups
           linetype = "strata", # Change line type by groups
           surv.median.line = "hv", # Specify median survival
           ggtheme = theme_bw(), # Change ggplot2 theme
           palette = c("#E7B800", "#2E9FDF"))
dev.off()
png 
  2 
### 1.7 Prediction in discovery cohort (short signature) ----
# Reduced, qPCR-friendly signatures from the TOPOSCORE paper: 5 NR-associated
# species (SIGB1) and 15 R-associated species (SIGB2).
SIGB1_PCR <- c('Enterocloster_bolteae', 'Clostridium_symbiosum', 'Erysipelatoclostridium_ramosum',
               'Hungatella_hathewayi', 'Veillonella_atypica')
SIGB2_PCR <- c('Anaerostipes_hadrus', 'Blautia_wexlerae', 'Coprococcus_comes', 
               'Dorea_formicigenerans', 'Dorea_longicatena', 'Eubacterium_rectale', 
               'Eubacterium_ventriosum', 'Faecalibacterium_prausnitzii', 
               'Gemmiger_formicilis', 'Phocaeicola_massiliensis', 'Roseburia_hominis', 
               'Roseburia_intestinalis', 'Roseburia_inulinivorans', 
               'Ruminococcus_bicirculans', 'Ruminococcus_lactaris')
# Same decision thresholds as the full signature; only the species lists change.
scores_disc_short <- compute_toposcore(met4_disc, sigb1 = SIGB1_PCR, sigb2 = SIGB2_PCR)
pred_disc_short <- clin_disc %>% left_join(scores_disc_short, by = 'Sample_id') %>% assign_prediction(ycut_r$THRESHOLD, ycut_nr$THRESHOLD)
hr_disc_short <- get_hr(pred_disc_short, type = 'OS', by = 'PRED')
log_msg('Prediction discovery (short): HR = %.2f [%.2f-%.2f], p = %.1e', hr_disc_short[1], hr_disc_short[2], hr_disc_short[3], hr_disc_short[4])
fig_km_disc_short <- print_plot(plot_mykm(pred_disc_short, type = 'OS', by = 'PRED'))

#ggsave(fig_km_disc_short, filename = 'results/TOPOSCORE/fig_km_disc_short.pdf', width = 10, height = 10, units = "cm")

pdf(file="results/Correlation_network_analysis/TOPOSCORE/fig_km_disc_short2.pdf",width = 10,height = 10)
km_fit2 <- survfit(Surv(OS, Death) ~ PRED, data = pred_disc_short)
# NOTE(review): as above, wrap ggsurvplot() in print() if the PDF is empty
# when run non-interactively -- confirm.
ggsurvplot(km_fit2, data = pred_disc_short,
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE, # Add risk table
           risk.table.col = "strata", # Change risk table color by groups
           linetype = "strata", # Change line type by groups
           surv.median.line = "hv", # Specify median survival
           ggtheme = theme_bw(), # Change ggplot2 theme
           palette = c("#E7B800", "#2E9FDF"))
dev.off()
png 
  2 
## 2. Validation analysis set ----
log_msg('####### Validation analysis set #########')
# Load the validation-cohort clinical table and matched microbiome profiles.
clin_valid <- load_clin(cohort = 'Valid')
met4_valid <- load_microbiome(clin_valid)

### 2.1 Toposcoring ----
# Score with the full signatures and plot the score density with the
# discovery-derived thresholds overlaid.
scores_valid <- compute_toposcore(met4_valid, sigb1 = SIGB1, sigb2 = SIGB2) 
plt_kde_valid <- plot_toposcoreb01_density(scores_valid, clin_valid, lims = c(ycut_r$THRESHOLD, ycut_nr$THRESHOLD))
ggsave(plt_kde_valid, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_kde_valid.pdf', width = 10, height = 10, units = "cm")


### 2.2 Prediction in validation cohort ----
pred_valid <- clin_valid %>% left_join(scores_valid, by = 'Sample_id') %>% 
  assign_prediction(ycut_r$THRESHOLD, ycut_nr$THRESHOLD)
hr_valid <- get_hr(pred_valid, type = 'OS', by = 'PRED')
log_msg('Prediction validation: HR = %.2f [%.2f-%.2f], p = %.1e', hr_valid[1], hr_valid[2], hr_valid[3], hr_valid[4])
fig_km_valid <- print_plot(plot_mykm(pred_valid, type = 'OS', by = 'PRED'))

ggsave(fig_km_valid, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_km_valid.pdf', width = 10, height = 10, units = "cm")

# FIX: ggsurvplot objects are only drawn when printed; wrap in print() so the
# PDF is not empty when this script is source()d.
pdf(file = "results/Correlation_network_analysis/TOPOSCORE/fig_km_valid2.pdf", width = 10, height = 10)
km_fit3 <- survfit(Surv(OS, Death) ~ PRED, data = pred_valid)
print(ggsurvplot(km_fit3, data = pred_valid,
                 pval = TRUE, conf.int = TRUE,
                 risk.table = TRUE,             # add number-at-risk table
                 risk.table.col = "strata",     # color risk table by group
                 linetype = "strata",           # line type by group
                 surv.median.line = "hv",       # mark median survival
                 ggtheme = theme_bw(),
                 palette = c("#E7B800", "#2E9FDF")))
dev.off()
png 
  2 
### 2.3 Prediction in validation cohort (short signature) ----
# Repeat the short-signature scoring and survival evaluation on the
# validation cohort, reusing the discovery thresholds.
scores_valid_short <- compute_toposcore(met4_valid, sigb1 = SIGB1_PCR, sigb2 = SIGB2_PCR)
pred_valid_short <- clin_valid %>% left_join(scores_valid_short, by = 'Sample_id') %>% assign_prediction(ycut_r$THRESHOLD, ycut_nr$THRESHOLD)
hr_valid_short <- get_hr(pred_valid_short, type = 'OS', by = 'PRED')
log_msg('Prediction validation (short): HR = %.2f [%.2f-%.2f], p = %.1e', hr_valid_short[1], hr_valid_short[2], hr_valid_short[3], hr_valid_short[4])
fig_km_valid_short <- print_plot(plot_mykm(pred_valid_short, type = 'OS', by = 'PRED'))

ggsave(fig_km_valid_short, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_km_valid_short.pdf', width = 10, height = 10, units = "cm")

# FIX: ggsurvplot objects are only drawn when printed; wrap in print() so the
# PDF is not empty when this script is source()d.
pdf(file = "results/Correlation_network_analysis/TOPOSCORE/fig_km_valid_short2.pdf", width = 10, height = 10)
km_fit4 <- survfit(Surv(OS, Death) ~ PRED, data = pred_valid_short)
print(ggsurvplot(km_fit4, data = pred_valid_short,
                 pval = TRUE, conf.int = TRUE,
                 risk.table = TRUE,             # add number-at-risk table
                 risk.table.col = "strata",     # color risk table by group
                 linetype = "strata",           # line type by group
                 surv.median.line = "hv",       # mark median survival
                 ggtheme = theme_bw(),
                 palette = c("#E7B800", "#2E9FDF")))
dev.off()
png 
  2 

5.Function Prediction(5.功能预测)

Tax4Fun2-function prediction(Tax4Fun2-功能预测)

source("./function/runRefBlast2.R")
otupath = "./"
# Output folder for Tax4Fun2 results; create parent folders too and stay
# silent if the folder already exists (the original warned on re-runs and
# failed when the parent directory was missing).
funcpath = paste0(otupath, "results/Function_Prediction/Tax4Fun2/")
dir.create(funcpath, recursive = TRUE, showWarnings = FALSE)

library(Tax4Fun2)

## First configure the database - the download may fail frequently;
## using the backup archive linked below is recommended.
# buildReferenceData(path_to_working_directory = '.', use_force = FALSE, install_suggested_packages = TRUE)
# Database download link is https://zenodo.org/records/10035668
path_to_reference_data = "./results/Function_Prediction/Tax4Fun2/Tax4Fun2_ReferenceData_v2"

# Optional sanity check that the bundled blastn binary is callable:
# blast_bin = file.path(path_to_reference_data, "blast_bin/bin/blastn.exe")
# res = system(command = paste(blast_bin, "-help"), intern = T)

otudir = funcpath

# Load package (already attached above; kept for chunk independence)
library(Tax4Fun2)
# Species annotation:
# specify the OTU representative sequences, the Tax4Fun2 reference data
# location, the reference database version, and the number of blastn threads.
# runRefBlast2(path_to_otus = './data/data.ps/otu.fa',
#             path_to_reference_data = path_to_reference_data,
#             path_to_temp_folder = otudir, database_mode = 'Ref100NR',
#             use_force = TRUE, num_threads = 4)


# Predicting community function:
# specify the OTU abundance table, the Tax4Fun2 reference data location,
# the database version, and the annotation results from the previous step.
# makeFunctionalPrediction(path_to_otu_table = './data/data.ps/otutab.txt',
#                          path_to_reference_data = path_to_reference_data,
#                          path_to_temp_folder = otudir,
#                          database_mode = 'Ref100NR',
#                          normalize_by_copy_number = TRUE,
#                          min_identity_to_reference = 0.97,
#                          normalize_pathways = FALSE)

Funguild-function prediction(Funguild-功能预测)

# Load packages
# BiocManager::install("microeco")
library(EasyStat)
library(microeco)
library(ggplot2)
library("WGCNA")
library(tidyverse)
library(ggtree)
library(SpiecEasi)
library(ggClusterNet)
library(phyloseq)
library(magrittr)
# Install (if needed) and attach additional helper packages
p_list = c("ggplot2", "BiocManager", "devtools", "picante", "GUniFrac", "ggalluvial", "rgexf")
for (p in p_list) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE, quietly = TRUE, warn.conflicts = FALSE)
}

ps = readRDS("./data/dataNEW/ps_ITS.rds")
# Import the built-in fungal example data instead, if desired:
# data(sample_info_ITS)
# data(otu_table_ITS)
# data(taxonomy_table_ITS)

# OTU table from phyloseq, transposed so rows are taxa and columns samples
otu = ps %>% vegan_otu() %>%
  t() %>%
  as.data.frame()

tax = ps %>% vegan_tax() %>%
  as.data.frame()
# Construct the microeco analysis object
dataset = microtable$new(sample_table = sample_data(ps), otu_table = otu, tax_table = tax)
# Keep fungal taxa only
dataset$tax_table %<>% subset(Kingdom == "Fungi")

# Functional prediction (FUNGuild)
t2 = trans_func$new(dataset)
# Assign functional annotations to each taxon
t2$cal_spe_func()
data = t2$res_spe_func_raw_funguild

# Output folder; create parents and stay silent if it already exists
fugipath = paste0(otupath, "results/Function_Prediction/funguild")
dir.create(fugipath, recursive = TRUE, showWarnings = FALSE)
write.csv(data, paste0(fugipath, "/funguild.csv"))


## FAPROTAX functional prediction for the 16S dataset
# install.packages("VGAM")
ps = readRDS("./data/dataNEW/ps_16s.rds")
# Load packages
library(microeco)
library(ggplot2)
library("WGCNA")
library(tidyverse)
library(ggtree)
library("SpiecEasi")
library(ggClusterNet)
library(phyloseq)
library(magrittr)

# ps = readRDS("./data/dataNEW/ps_16s.rds")
# Keep the 500 most relevant OTUs (per ggClusterNet's filter_OTU_ps)
ps = ps %>%
  filter_OTU_ps(500)
  
# OTU table from phyloseq, transposed so rows are taxa and columns samples
otu = ps %>% vegan_otu() %>%
  t() %>%
  as.data.frame()
  
tax = ps %>% vegan_tax() %>%
  as.data.frame()
# Construct the microeco analysis object
dataset = microtable$new(sample_table = sample_data(ps), otu_table = otu, tax_table = tax)
  
# Functional prediction; cal_spe_func() assigns FAPROTAX functions per taxon
t2 = trans_func$new(dataset)
t2$cal_spe_func()
# Preview the first rows/columns of the taxon-by-function table
t2$res_spe_func[1:5, 1:6]
        methanotrophy acetoclastic_methanogenesis
ASV_662             0                           0
ASV_782             0                           0
ASV_874             0                           0
ASV_223             0                           0
ASV_508             0                           0
        methanogenesis_by_disproportionation_of_methyl_groups
ASV_662                                                     0
ASV_782                                                     0
ASV_874                                                     0
ASV_223                                                     0
ASV_508                                                     0
        methanogenesis_using_formate methanogenesis_by_CO2_reduction_with_H2
ASV_662                            0                                       0
ASV_782                            0                                       0
ASV_874                            0                                       0
ASV_223                            0                                       0
ASV_508                            0                                       0
        methanogenesis_by_reduction_of_methyl_compounds_with_H2
ASV_662                                                       0
ASV_782                                                       0
ASV_874                                                       0
ASV_223                                                       0
ASV_508                                                       0
# Keep only ASVs that received at least one functional assignment
data = t2$res_spe_func
data = data[rowSums(data) > 0, ]
# Output folder for FAPROTAX results; create parents and stay silent if present
betapath = paste0(otupath, "results/Function_Prediction/FAPROTAX/")
dir.create(betapath, recursive = TRUE, showWarnings = FALSE)
# betapath already ends with "/": append the file name directly
# (the original produced a double slash: ".../FAPROTAX//FAPROTAX.csv")
write.csv(data, paste0(betapath, "FAPROTAX.csv"))
# View the FAPROTAX function group list
t2$func_group_list
$FAPROTAX
$FAPROTAX$`Energy source`
[1] "aerobic_chemoheterotrophy"   "anaerobic_chemoheterotrophy"
[3] "photoautotrophy"             "photoheterotrophy"          

$FAPROTAX$`C-cycle`
 [1] "cellulolysis"            "xylanolysis"            
 [3] "chitinolysis"            "ligninolysis"           
 [5] "fermentation"            "methanogenesis"         
 [7] "methanotrophy"           "methylotrophy"          
 [9] "hydrocarbon_degradation" "oil_bioremediation"     

$FAPROTAX$`N-cycle`
 [1] "nitrogen_fixation"             "nitrification"                
 [3] "aerobic_ammonia_oxidation"     "aerobic_nitrite_oxidation"    
 [5] "nitrate_reduction"             "nitrate_respiration"          
 [7] "nitrite_respiration"           "nitrogen_respiration"         
 [9] "denitrification"               "nitrite_denitrification"      
[11] "nitrate_denitrification"       "nitrous_oxide_denitrification"
[13] "ureolysis"                    

$FAPROTAX$`S-cycle`
[1] "sulfate_respiration"                "sulfur_respiration"                
[3] "sulfite_respiration"                "dark_sulfide_oxidation"            
[5] "respiration_of_sulfur_compounds"    "thiosulfate_respiration"           
[7] "dark_oxidation_of_sulfur_compounds"
# View the FAPROTAX annotation details for a single functional category
t2$show_prok_func("methanotrophy")
[1] "elements:C,H; main_element:C; electron_donor:C; electron_acceptor:variable; aerobic:variable; exclusively_prokaryotic:yes; light_dependent:no"

6.Other microbiome analysis(6.其他微生物组分析)

Co-founders analysis

# load packages
library(vegan)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(rdacca.hp)

# Load species abundance data (taxa x samples), then transpose to
# samples x taxa. Use the exact argument name: the original `head = T`
# relied on partial argument matching and the reassignable shortcut T.
otu = read.csv('data/species_data.csv', header = TRUE, row.names = 1)
otu <- data.frame(t(otu))

# Group/metadata table ("matadata" [sic] — name kept because later chunks use it)
matadata <- read.table("data/group_data.txt", header = TRUE, row.names = 1, sep = "\t", comment.char = "")
otu = otu[rownames(otu) %in% rownames(matadata), ]

# Confounding factors (covariate table); drop rows with missing values and
# keep only samples present in the metadata
env = read.csv('data/c_index_species_new4.csv', header = TRUE, row.names = 1)
env = na.omit(env)
env = env[rownames(env) %in% rownames(matadata), ]

# Restrict the OTU table to the samples remaining in `env` via a merge on
# sample ids. (Renamed from `rownames`, which shadowed base::rownames.)
env_ids = data.frame(rownames = rownames(env))
otu$rownames = rownames(otu)
otu = merge(otu, env_ids, by = "rownames")
rownames(otu) = otu$rownames
otu = otu[, -1]

## Calculate db-RDA step by step according to the principle
# Calculate the sample distance, taking Bray-curtis distance as an example, details ?vegdist
# dis_bray <- vegdist(otu, method = 'bray')
# Here a precomputed Bray-Curtis distance matrix is read from file instead,
# then subset to the samples present in the metadata (rows and columns).
distance_mat = read.table(paste("data/Species_beta_diversity2.txt",sep=""), header=T, row.names=1, sep="\t", comment.char="")
metadata2 = t(matadata)
distance_mat2 = distance_mat[rownames(distance_mat) %in% rownames(matadata), ]
distance_mat3 = distance_mat2[, colnames(distance_mat2) %in% colnames(metadata2)]
distance_mat = distance_mat3

# Or directly use the existing distance matrix, which is also the Bray-curtis distance
dis_bray <- as.dist(distance_mat)

# PCoA ordination; add = TRUE corrects negative eigenvalues, details ?cmdscale
pcoa <- cmdscale(dis_bray, k = nrow(otu) - 1, eig = TRUE, add = TRUE)

# Extract PCoA sample scores (coordinates)
pcoa_site <- pcoa$point

# db-RDA: RDA of the PCoA axes constrained by the environmental variables
# Executed via the RDA function rda() of the vegan package, details ?rda
db_rda <- rda(pcoa_site, env, scale = FALSE)
# summary(db_rda)

# Passively fit species scores onto the constrained (CCA) and unconstrained
# (CA) axes, then normalize each axis (MARGIN = 2 = columns).
v.eig <- t(otu) %*% db_rda$CCA$u/sqrt(nrow(otu) - 1)
db_rda$CCA$v <- decostand(v.eig, 'normalize', MARGIN = 2)
v.eig <- t(otu) %*% db_rda$CA$u/sqrt(nrow(otu) - 1)
db_rda$CA$v <- decostand(v.eig, 'normalize', MARGIN = 2)

# Extract the data needed for drawing
score = scores(db_rda)
# score$sites
# Inspect biplot scores (arrow coordinates) of the constraining variables
db_rda$CCA$biplot
              RDA1        RDA2       RDA3
status  0.76440409 -0.48475784  0.4250838
sex     0.98352818  0.17911294 -0.0243079
age    -0.01213581 -0.06874258 -0.9975606
# (optional) inspect species scores via score$species

# Site coordinates on the first two constrained axes
site_scores <- as.data.frame(score$sites)
CAP1 = site_scores[, 1]
CAP2 = site_scores[, 2]

# Arrow segments for the constraining variables (biplot scores)
seg = as.data.frame(db_rda$CCA$biplot)

# Derive the group label from each sample name by stripping the digits
CPA_data = site_scores
CPA_data$group = gsub("[0-9]", "", as.character(rownames(CPA_data)))

# Assemble the plotting table: sample id, first two axes, group label
plotdata = data.frame(rownames(site_scores), CAP1, CAP2, CPA_data$group)
colnames(plotdata) = c('sample', 'CAP1', 'CAP2', 'Group') # add column names
# write.csv(plotdata,'data/data5/dbRDA_npc_KO9.csv')

# Percentage of constrained variance explained by axes 1 and 2
constrained_eig <- db_rda$CCA$eig
CAP1_exp = round(constrained_eig[1] / sum(constrained_eig) * 100, 2)
CAP2_exp = round(constrained_eig[2] / sum(constrained_eig) * 100, 2)

# db-RDA ordination plot: samples colored by group, constraint arrows in red
p1 = ggplot(plotdata, aes(CAP1, CAP2)) +
  geom_point(aes(fill = Group, color = Group),size = 1.3) + 
  scale_fill_manual(values = c("#74add1","#a60026"))+
  scale_color_manual(values = c("#74add1","#a60026"))+
  #add confidence intervals
  #stat_ellipse(linetype = 1,level = 0.95,aes(group = Treatment, color = Treatment)) +
  #stat_chull(geom = 'polygon', aes(group = Treatment, color = Treatment, fill = Treatment), alpha = 0.1) +
  xlab(paste('dbRDA1 ( ',CAP1_exp,'%',' )', sep = '')) + 
  ylab(paste('dbRDA2 ( ',CAP2_exp,'%',' )', sep = '')) +
  # arrows for the constraining (confounder) variables
  geom_segment(data = seg, aes(x = 0, y = 0, xend = seg[,1], yend = seg[,2]),
               colour = "red", size = 0.3,
               arrow = arrow(angle = 30, length = unit(0.4, 'cm'))) +
  geom_text_repel(data = seg, segment.colour = 'black',
                  aes(x = seg[,1], y = seg[,2], 
                      label = rownames(seg)),size = 3) +
  geom_vline(aes(xintercept = 0), linetype = 'dotted') +
  geom_hline(aes(yintercept = 0), linetype = 'dotted') +
  theme_bw()+
  theme(text = element_text(family = 'sans', size = 12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()
        ,legend.position = 'none'
        )+ 
  theme(axis.text = element_text(colour = 'black',size = 12))+
  # 95% confidence ellipses per group
  stat_ellipse(#data=plotdata,
               #geom = "polygon",
               level=0.95,
               linetype = 2,size=0.7,
               aes(color=Group),
               alpha=0.8)+
  coord_equal(ratio=0.7)
# FIX: the original pasted an extra ".pdf" onto a name that already ended in
# ".pdf", producing "dbRDA_Species01.pdf.pdf".
ggsave("results/Other_microbiome_analysis/dbRDA_Species01.pdf", p1, width=149 * 1.5, height=80 * 1.5, unit='mm')
p1

# Permutation test of each covariate against the db-RDA ordination
# Coerce covariates to plain vectors/numerics before envfit()
env$status = as.numeric(env$status)
env$sex = as.vector(env$sex)
# env$region = as.vector(env$region)
env$age = as.vector(env$age)

# Fit covariate vectors onto the ordination with 999 permutations
envfit <- envfit(db_rda, env, permutations  = 999)
# Collect r2 and permutation p-values, then BH-adjust across covariates
r <- as.matrix(envfit$vectors$r)
p <- as.matrix(envfit$vectors$pvals)
env.p <- cbind(r,p)
colnames(env.p) <- c("r2","p-value")
KK <- as.data.frame(env.p)
KK$p.adj = p.adjust(KK$`p-value`, method = 'BH')
# KK
# write.csv(as.data.frame(env.p),file="data/rdaenvfit_HP_new_species_01.csv")

# Use rdacca.hp (hierarchical partitioning) to quantify the individual
# contribution of each covariate to the db-RDA explained variance
dis_bray <- as.dist(distance_mat)
bray = dis_bray
cap.hp = rdacca.hp(bray, env, method = 'dbRDA', type = 'R2', scale = FALSE)
# Total variance explained by all covariates together
cap.hp$Total_explained_variation
[1] 0.204
cap.hp$Hier.part
       Unique Average.share Individual I.perc(%)
status 0.0760       -0.0009     0.0751     36.81
sex    0.0788        0.0026     0.0814     39.90
age    0.0539       -0.0063     0.0476     23.33
# Export the explanation of each environmental factor
# write.csv(cap.hp$Hier.part, 'data/env_effect_HP_new_Species_02.csv')

# Unique:每个解释变量单独解释的总变化量的比例,即与其它解释变量不存在共存解释的部分
# Unique: The proportion of the total variation explained by each explanatory variable alone, that is, the part that does not coexist with other explanatory variables
# Average.share: 对每个解释变量与其它解释变量共同解释部分的分割
# Average.share: Split each explanatory variable into its common explanation with other explanatory variables
# Individual: 每个解释变量占总变化量的比例,Individual等于Unique和Average.share的总和
# Individual: the proportion of total variation attributed to each explanatory variable; Individual equals the sum of Unique and Average.share
# I.perc(%): 每个解释变量占总被解释变化量的比例,即各解释变量的Individual占Individual总和的百分比,Individual总和等同db-RDA的总解释方差占比
# I.perc(%): the share of each explanatory variable in the total explained variation, i.e. each variable's Individual as a percentage of the Individual sum; the Individual sum equals the total explained variance of the db-RDA

Batch effect removal

library(MMUPHin) # meta-analysis of microbiome data
library(magrittr) # pipe operators in R
library(dplyr) # data processing
library(ggplot2) # plotting
library(vegan) # community ecology analyses

# Remove batch effects
# Load metadata (rows = samples). FIX: use the exact argument name
# `check.names`; the original `check.name` relied on partial argument matching.
meta.all <- read.csv(file = 'data/group_batch.csv', stringsAsFactors = FALSE, header = TRUE, row.names = 1, check.names = FALSE)
# rownames(meta.all) <- meta.all$Run
meta.all$StudyID <- factor(meta.all$StudyID)

# Import relative abundance data of bacterial species
feat.abu <- read.table(file = "data/species_batch.txt", sep = "\t", header = TRUE, check.names = FALSE)

# Collapse duplicate species rows by summing their abundances
feat.abu <- aggregate(. ~ Species, data = feat.abu, sum)
rownames(feat.abu) <- feat.abu$Species
feat.abu <- feat.abu[, -1]

# Ensure feat.abu columns match meta.all rownames (same samples, same order)
feat.abu <- feat.abu[, rownames(meta.all)]

# Replace NA values with 0 and rescale percentages to proportions
feat.abu[is.na(feat.abu)] <- 0
feat.abu <- feat.abu / 100

# Zero-inflated empirical Bayes adjustment of batch effect in compositional
# feature abundance data, preserving Group/Gender/Age effects
fit_adjust_batch <- adjust_batch(
  feature_abd = feat.abu,
  batch = "StudyID",
  covariates = c("Group", "Gender", "Age"),
  data = meta.all,
  control = list(verbose = FALSE)
)

# Adjusted feature abundance data, rescaled back to percentages
npc_abd_adj <- fit_adjust_batch$feature_abd_adj
npc_abd_adj <- npc_abd_adj * 100

# Save adjusted data to CSV
write.csv(npc_abd_adj, 'results/Other_microbiome_analysis/Species_ra_adj_age_new100.csv')

# Batch Effect Diagram
# Load packages
library(vegan, quietly = TRUE)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(ggthemes)
library(ggpubr)
library(vegan)
library(UpSetR)
library(ggsci)
library(doBy)

# Bray-Curtis dissimilarities (vegdist default) on samples x species,
# before and after batch correction
D_before <- vegdist(t(feat.abu))
D_after <- vegdist(t(npc_abd_adj))

# PERMANOVA: variance explained by StudyID (batch) before correction
set.seed(1)
fit_adonis_before <- adonis2(D_before ~ StudyID, data = meta.all, permutations = 999, method = "bray", binary = FALSE)

# PCoA before batch correction
feat.abu2 <- as.data.frame(t(feat.abu))
dune_dist_before <- vegdist(feat.abu2, method = "bray", binary = FALSE)
dune_pcoa_before <- cmdscale(D_before, k = (nrow(feat.abu2) - 1), eig = TRUE, add = TRUE)
dune_pcoa_points_before <- as.data.frame(dune_pcoa_before$points)
sum_eig_before <- sum(dune_pcoa_before$eig)
eig_percent_before <- round(dune_pcoa_before$eig / sum_eig_before * 100, 1)
# FIX: label every returned axis. The original hard-coded paste0("PCoA", 1:3),
# which errors whenever cmdscale() returns a different number of axes
# (it asks for k = n - 1 axes above).
colnames(dune_pcoa_points_before) <- paste0("PCoA", seq_len(ncol(dune_pcoa_points_before)))
dune_pcoa_result_before <- cbind(dune_pcoa_points_before, meta.all)

# Plot PCoA before batch correction
# FIX: guard the conflicted-package call — the original called
# conflicts_prefer() without attaching `conflicted`, which errors when that
# package is not installed/loaded.
if (requireNamespace("conflicted", quietly = TRUE)) {
  conflicted::conflicts_prefer(ggplot2::theme_minimal)
}
p1 <- ggplot(dune_pcoa_result_before, aes(x=PCoA1, y=PCoA2, color = Group, shape = StudyID)) +
  geom_point(size=2) +
  scale_shape_manual(values = c(15, 10, 17, 18, 19, 8, 25, 3)) +
  labs(x=paste("PCoA 1 (", eig_percent_before[1], "%)", sep=""),
       y=paste("PCoA 2 (", eig_percent_before[2], "%)", sep="")) +
  theme_classic()

# Same ordination colored by StudyID with group centroids and star plot,
# to visualize the batch structure directly
plot1 <- ggscatter(dune_pcoa_result_before, x= "PCoA1", y = "PCoA2", color="StudyID",
                   mean.point = TRUE, star.plot = TRUE, ggtheme = theme_minimal()) +
  labs(x=paste("PCo 1 (", eig_percent_before[1], "%)", sep=""),
       y=paste("PCo 2 (", eig_percent_before[2], "%)", sep="")) +
  theme_classic() +
  geom_vline(xintercept = 0, color = 'gray', size = 0.4) + 
  geom_hline(yintercept = 0, color = 'gray', size = 0.4) +
  theme(panel.grid = element_line(color = 'black', linetype = 2, size = 0.1), 
        panel.background = element_rect(color = 'black', fill = 'transparent'), 
        legend.title = element_blank()) +
  theme(axis.title = element_text(size = 18, colour="black"),
        axis.text = element_text(size = 16, colour = "black"),
        legend.text = element_text(size = 16))

# PERMANOVA after batch correction: variance still explained by StudyID
set.seed(1)
fit_adonis_after <- adonis2(D_after ~ StudyID, data = meta.all, permutations = 999, method = "bray")

# PCoA after batch correction
npc_abd_adj2 <- as.data.frame(t(npc_abd_adj))
dune_dist_after <- vegdist(npc_abd_adj2, method = "bray", binary = FALSE)
dune_pcoa_after <- cmdscale(D_after, k = (nrow(npc_abd_adj2) - 1), eig = TRUE, add = TRUE)
dune_pcoa_points_after <- as.data.frame(dune_pcoa_after$points)
sum_eig_after <- sum(dune_pcoa_after$eig)
eig_percent_after <- round(dune_pcoa_after$eig / sum_eig_after * 100, 1)
# FIX: label every returned axis. The original hard-coded paste0("PCoA", 1:3),
# which errors whenever cmdscale() returns a different number of axes.
colnames(dune_pcoa_points_after) <- paste0("PCoA", seq_len(ncol(dune_pcoa_points_after)))
dune_pcoa_result_after <- cbind(dune_pcoa_points_after, meta.all)

# Plot PCoA after batch correction
p2 <- ggplot(dune_pcoa_result_after, aes(x = PCoA1, y = PCoA2, color = Group, shape = StudyID)) +
  geom_point(size = 2) +
  scale_shape_manual(values = c(15, 10, 17, 18, 19, 8, 25, 3)) +
  labs(x = paste("PCo 1 (", eig_percent_after[1], "%)", sep = ""),
       y = paste("PCo 2 (", eig_percent_after[2], "%)", sep = "")) +
  theme_classic() +
  # 95% confidence ellipses per biological group
  stat_ellipse(level = 0.95, linetype = 2, size = 0.7, aes(color = Group), alpha = 0.8)

# Batch-colored ordination after correction (group centroids + star plot)
plot2 <- ggscatter(dune_pcoa_result_after, x = "PCoA1", y = "PCoA2", color = "StudyID",
                   mean.point = TRUE, star.plot = TRUE, ggtheme = theme_minimal()) +
  labs(x = paste0("PCoA 1 (", eig_percent_after[1], "%)"),
       y = paste0("PCoA 2 (", eig_percent_after[2], "%)")) +
  theme_classic() +
  geom_vline(xintercept = 0, color = 'gray', size = 0.4) +
  geom_hline(yintercept = 0, color = 'gray', size = 0.4) +
  theme(panel.grid = element_line(color = 'black', linetype = 2, size = 0.1),
        panel.background = element_rect(color = 'black', fill = 'transparent'),
        legend.title = element_blank(),
        axis.title = element_text(size = 18, colour = "black"),
        axis.text = element_text(size = 16, colour = "black"),
        legend.text = element_text(size = 16))

# Side-by-side comparison of the before/after ordinations via patchwork
library(patchwork)
p_all <- p1 + p2
ggsave(p_all, file = "results/Other_microbiome_analysis/Compare01.pdf", width = 289, height = 150, unit = 'mm')

p_all2 <- plot1 + plot2
ggsave(p_all2, file = "results/Other_microbiome_analysis/Compare02.pdf", width = 289, height = 100, unit = 'mm')

Halla analysis

#Halla Analysis Implementation
#Implementation of HALLA module through online websites http://galaxy.biobakery.org/

# Inputs
# HAllA requires two tab-delimited text files representing two paired datasets (sets of features) describing the same set of samples.
# The example data inputs can be found in the "example_data" folder or http://galaxy.biobakery.org/

# Outputs
# A pdf - the "halagram.pdf"
# A zip file containing all the files in the generated output directory

# Load packages
library(pheatmap)
library(tidyr)
library(vegan)
library(ComplexHeatmap)

# Graphic beautification: heat map of HAllA pairwise associations
data <- read.table("data/all_associations.txt", header=TRUE, sep="\t")

# Convert the long association table to wide form (X features x Y features)
data2 <- pivot_wider(data, names_from = "Y_features", values_from = "association")
data2 <- as.data.frame(data2)
rownames(data2) <- data2$X_features
data2 <- data2[, -1]

# Draw the heat map; pass an explicit matrix (ComplexHeatmap::Heatmap expects
# a matrix and would otherwise coerce the data.frame itself)
p <- Heatmap(as.matrix(data2), 
             name = "Association",
             row_names_gp = gpar(fontsize = 8),
             cluster_rows = TRUE, cluster_columns = TRUE,
             col = colorRampPalette(colors = c("#87CEFF", "white", "#FF7256"))(50)
             )
# Save plot
pdf("results/Other_microbiome_analysis/halla_association01.pdf", width = 10, height = 8)
draw(p)
dev.off()
png 
  2 
# Draw a heat map annotated with significant q-values
data_km_q <- read.table("data/all_q_value.txt", header=TRUE, sep="\t")
data_km_q2 <- pivot_wider(data_km_q, names_from = "Y_features", values_from = "q.values")
data_km_q2 <- as.data.frame(data_km_q2)
rownames(data_km_q2) <- data_km_q2$X_features
data_km_q2 <- data_km_q2[, -1]

# FIX: align the q-value table to the association matrix so that cell_fun's
# (i, j) indices refer to the same feature pair in both tables (pivot_wider
# output order may differ between the two input files).
data_km_q2 <- data_km_q2[rownames(data2), colnames(data2)]

p2 <- Heatmap(as.matrix(data2), 
              name = "Association",
              # Print the q-value inside each significant cell.
              # FIX: guard against missing q-values — `if (NA < 0.05)` errors.
              cell_fun = function(j, i, x, y, width, height, fill) {
                q <- data_km_q2[i, j]
                if (!is.na(q) && q < 0.05) {
                  grid.text(sprintf("%.3f", q), x, y, gp = gpar(fontsize = 6))
                }
              }
              )
# Save plot
pdf("results/Other_microbiome_analysis/halla_association2.pdf", width = 10, height = 8)
draw(p2)
dev.off()
png 
  2 

Procrustes Analysis

# Load packages and vegan's built-in example data
library(vegan)
library(ggplot2)
data(varespec)
data(varechem)

# Distance matrices: Bray-Curtis for the community, Euclidean on the
# standardized environmental variables
species_dist <- vegdist(varespec, method = "bray")
env_dist <- vegdist(scale(varechem), method = "euclidean")

# Dimensionality reduction of each distance matrix using NMDS
nmds_species <- monoMDS(species_dist)
nmds_env <- monoMDS(env_dist)

# Procrustes analysis: rotate/scale the environment ordination onto the
# community ordination (symmetric = TRUE makes the solution order-invariant)
proc_result <- procrustes(nmds_species, nmds_env, symmetric = TRUE)
summary(proc_result)

Call:
procrustes(X = nmds_species, Y = nmds_env, symmetric = TRUE) 

Number of objects: 24    Number of dimensions: 2 

Procrustes sum of squares:  
 0.6515295 
Procrustes root mean squared error: 
 0.1647637 
Quantiles of Procrustes errors:
       Min         1Q     Median         3Q        Max 
0.02987198 0.10213559 0.14600278 0.17560233 0.34398810 

Rotation matrix:
           [,1]      [,2]
[1,]  0.7730178 0.6343843
[2,] -0.6343843 0.7730178

Translation of averages:
              [,1]         [,2]
[1,] -1.681189e-18 2.334995e-18

Scaling of target:
[1] 0.5903139
# Significance test: PROTEST permutation test of the Procrustes statistic
set.seed(1)
proc_test <- protest(nmds_species, nmds_env, permutations = 999)
proc_test

Call:
protest(X = nmds_species, Y = nmds_env, permutations = 999) 

Procrustes Sum of Squares (m12 squared):        0.6515 
Correlation in a symmetric Procrustes rotation: 0.5903 
Significance:  0.001 

Permutation: free
Number of permutations: 999
# Extract coordinate data: rotated Y configuration (environment) alongside
# the target X configuration (community)
procrustes_data <- as.data.frame(cbind(proc_result$Yrot, proc_result$X))
rotation_matrix <- as.data.frame(proc_result$rotation)

# Plot: arrows run from the rotated environment points (V1, V2) to the
# matching community points (MDS1, MDS2); shorter arrows = better agreement
p_proc <- ggplot(procrustes_data) +
  geom_segment(aes(x = V1, y = V2, xend = MDS1, yend = MDS2), 
               color = "#66C2A5", size = 1, arrow = arrow(length = unit(0.25, 'cm'))) +
  geom_point(aes(V1, V2), color = "#FC8D62", size = 4, shape = 16) +
  geom_point(aes(MDS1, MDS2), color = "#8DA0CB", size = 4, shape = 16) +
  theme_classic(base_size = 16) +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1), 
    panel.grid = element_blank(),
    panel.background = element_rect(fill = "transparent", color = NA),
    axis.ticks.length = unit(0.4, "lines"),
    axis.line = element_blank(),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold")
  ) +
  labs(x = "Dimension 1", y = "Dimension 2", 
       title = "Procrustes Analysis: Community vs Environment") +
  geom_vline(xintercept = 0, color = "grey60", linetype = "dashed", size = 0.5) +
  geom_hline(yintercept = 0, color = "grey60", linetype = "dashed", size = 0.5) +
  geom_abline(slope = 1, intercept = 0, color = "black", size = 0.8) +  # first diagonal, slope 1
  geom_abline(slope = -1, intercept = 0, color = "black", size = 0.8) + # second diagonal, slope -1
  annotate("text", label = paste("M2 =", round(proc_test$ss, 4), 
                                 "\nP-value =", format.pval(proc_test$signif)),
           x = 0.2, y = -0.15, size = 4, hjust = 0)
# FIX: render the plot explicitly — in a source()d script a bare ggplot object
# is not auto-printed, and graph2ppt() exports the last *displayed* graph.
print(p_proc)

# Save as PPT
library(export)
graph2ppt(file = "results/Other_microbiome_analysis/Procrustes_Analysis_with_Crosslines.ppt", width = 5, height = 5)

CausalMediationAnalysis

# Load packages
#install.packages("mediation")
library(ggplot2)
library(dplyr)
library(mediation)

# Simulate a mediation effect: fix the random seed and generate data where
# Sepal.Length -> mediator_var -> dependent_var, with uniform noise mixed in
set.seed(2344)
df <- iris %>%
  mutate(
    random_factor1 = runif(n(), min = min(Sepal.Length), max = max(Sepal.Length)),
    mediator_var = 0.4 * Sepal.Length + 0.6 * random_factor1,  # Simulating mediating variables
    random_factor2 = runif(n(), min = min(mediator_var), max = max(mediator_var)),
    dependent_var = 0.4 * mediator_var + 0.6 * random_factor2  # Simulating dependent variables
  )

# Statistical analysis
# 1. Calculate the total effect (the total impact of the independent variable on the dependent variable)
total_effect_model <- lm(dependent_var ~ Sepal.Length, data = df)
summary(total_effect_model)

Call:
lm(formula = dependent_var ~ Sepal.Length, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.28365 -0.39582  0.03309  0.41165  1.20451 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.19594    0.32851  15.817   <2e-16 ***
Sepal.Length  0.14481    0.05567   2.601   0.0102 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5627 on 148 degrees of freedom
Multiple R-squared:  0.04372,   Adjusted R-squared:  0.03726 
F-statistic: 6.767 on 1 and 148 DF,  p-value: 0.01023
# 2. Mediator model: effect of the independent variable (treatment) on the
# mediating variable — the "a" path of the mediation triangle.
mediator_model <- lm(mediator_var ~ Sepal.Length, data = df)
summary(mediator_model)

Call:
lm(formula = mediator_var ~ Sepal.Length, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.1320 -0.4904  0.0604  0.4698  0.9850 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)    3.6625     0.3417  10.719  < 2e-16 ***
Sepal.Length   0.4118     0.0579   7.112 4.53e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5852 on 148 degrees of freedom
Multiple R-squared:  0.2547,    Adjusted R-squared:  0.2497 
F-statistic: 50.58 on 1 and 148 DF,  p-value: 4.534e-11
# 3. Outcome model: effect of the mediator on the dependent variable while
# controlling for the independent variable — the "b" and "c'" paths.
mediated_effect_model <- lm(dependent_var ~ Sepal.Length + mediator_var, data = df)
summary(mediated_effect_model)

Call:
lm(formula = dependent_var ~ Sepal.Length + mediator_var, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.84899 -0.37869 -0.01874  0.34800  0.98188 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   3.53127    0.38714   9.122 5.20e-16 ***
Sepal.Length -0.04235    0.05701  -0.743    0.459    
mediator_var  0.45452    0.06988   6.504 1.15e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4975 on 147 degrees of freedom
Multiple R-squared:  0.2574,    Adjusted R-squared:  0.2473 
F-statistic: 25.48 on 2 and 147 DF,  p-value: 3.158e-10
# 4. Causal Mediation Analysis: decompose the total effect into the average
# causal mediation effect (ACME) and average direct effect (ADE), with
# nonparametric bootstrap confidence intervals (boot = TRUE).
conflicts_prefer(mediation::mediate)
causal_mediation_results <- mediate(
  model.m = mediator_model, 
  model.y = mediated_effect_model, 
  treat = "Sepal.Length", 
  mediator = "mediator_var", 
  boot = TRUE
)
summary(causal_mediation_results)

Causal Mediation Analysis 

Nonparametric Bootstrap Confidence Intervals with the Percentile Method

               Estimate 95% CI Lower 95% CI Upper p-value    
ACME             0.1872       0.1214         0.26  <2e-16 ***
ADE             -0.0424      -0.1518         0.06    0.43    
Total Effect     0.1448       0.0435         0.25  <2e-16 ***
Prop. Mediated   1.2925       0.7277         4.14  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Sample Size Used: 150 


Simulations: 1000 
# Plot
# Plot the mediation result (ACME / ADE / Total Effect point estimates with
# confidence intervals).
#
# FIX: the previous first definition piped `plot(med_result)` into ggplot2
# layers (`+ theme_minimal() + ...`). plot.mediate() from the 'mediation'
# package draws with base graphics and does not return a ggplot object, so
# adding ggplot2 components to it is an error; that broken definition was
# also immediately shadowed by a second one. Keep a single, working
# definition with the same name and signature.
#
# @param med_result A fitted 'mediate' object (mediation::mediate).
# @return Invisibly, whatever plot.mediate() returns; called for its side
#   effect of drawing on the current graphics device.
plot_mediation <- function(med_result) {
  plot(med_result, main = "Mediation Analysis Results")
}

# Draw on the interactive device
plot_mediation(causal_mediation_results)

# Save: base-graphics output must be re-drawn inside the pdf device
pdf("results/Other_microbiome_analysis/Mediation_Analysis_Plot.pdf", width = 8, height = 6)
plot_mediation(causal_mediation_results)
dev.off()
png 
  2 

Sparse PLS Analysis (sPLS_Analysis)

# BiocManager::install("mixOmics")
# install.packages("C:/Program Files/R/R-4.3.1/library/mixOmics_6.28.0.tar.gz", repos = NULL, type="source")
library(mixOmics)
library(RColorBrewer)

# mixOmics masks pca()/splsda() from other loaded packages; pin the versions
# we want explicitly.
conflicts_prefer(mixOmics::pca)
conflicts_prefer(mixOmics::splsda)

# 1. PCA: linear combination of variables for dimensionality reduction,
# demonstrated on the mixOmics liver toxicity gene-expression data.
data(liver.toxicity)
X <- liver.toxicity$gene
pca_result <- pca(X)

# Set color — one color per dose group (4 groups in this dataset)
custom_colors <- brewer.pal(4, "Set1")
plotIndiv(pca_result, group = liver.toxicity$treatment$Dose.Group,
          ind.names = FALSE, legend = TRUE,
          col = custom_colors, title = 'Liver toxicity: PCA comp 1 - 2',
          legend.title = 'Dose', size.title = rel(1.2),
          legend.title.pch = 'Exposure')

# Set a global ggplot2 theme for subsequent plots
theme_set(theme_classic(base_size = 14) + 
          theme(legend.position = "bottom", 
                axis.title.x = element_text(size = 16),
                axis.title.y = element_text(size = 16)))

# Scree plot of explained variance per component
plot(pca_result)

# Loadings of component 2, labelled with GenBank gene IDs
p1<- plotLoadings(pca_result, ndisplay = 100, comp = 2, 
             name.var = liver.toxicity$gene.ID[, "geneBank"],
             col = custom_colors[2], size.name = rel(0.5))

#p1

#p1

# 2. Sparse PCA: like PCA but with a keepX parameter selecting the top
# contributing variables per component (here 15, 10 and 5 genes).
MyResult.spca <- spca(X, ncomp = 3, keepX = c(15, 10, 5))

# Make sure you assign a color to each group
n_groups <- length(unique(liver.toxicity$treatment$Dose.Group))
col_palette <- brewer.pal(n_groups, "Set1")

# Plot samples on sPCA components 1-2, shaped by exposure time
plotIndiv(MyResult.spca, 
          group = liver.toxicity$treatment$Dose.Group, 
          pch = as.factor(liver.toxicity$treatment$Time.Group), 
          col = col_palette,
          legend = TRUE, 
          title = 'Liver Toxicity: Sparse PCA - Comp 1 vs Comp 2')

# This graph is a polar coordinate graph, mainly reflecting the correlation
# distribution between variables.

# In PCA you can also inspect the explained variance directly (contribution
# of each component, 0-1, larger = more), via tune.pca(X).

# 3. (s)PLS-DA: partial least squares adapted for classification problems,
# demonstrated on the SRBCT tumour gene-expression data.
data(srbct)
X <- srbct$gene
Y <- srbct$class

# sPLS-DA analysis: keep the top 50 genes on each of 2 components
MyResult.splsda <- splsda(X, Y, keepX = c(50, 50))

# Set color
n_groups <- length(unique(Y))
col_palette <- brewer.pal(n_groups, "Set1") # choose a palette sized by the number of groups

# Plot
plotIndiv(MyResult.splsda, 
          col = col_palette, 
          legend = TRUE, 
          title = 'sPLS-DA: SRBCT Data')

#4. PLS analysis, partial least squares regression is a multivariate method that designs two data matrices X and Y. By modeling the structure of two matrices, PLS goes beyond traditional multiple regression. Unlike traditional multiple regression models, it is not limited to unrelated variables. One of the advantages of PLS is that it can handle many noisy, collinear, and missing variables, and can also model several response variables simultaneously in Y.
# Load data
data(nutrimouse)
X <- nutrimouse$gene
Y <- nutrimouse$lipid

# sPLS analysis: sparse selection on both X (genes) and Y (lipids)
MyResult.spls <- spls(X, Y, keepX = c(25, 25), keepY = c(5, 5))
#MyResult.spls$names

# Set color — nutrimouse has two genotypes
n_groups <- length(unique(nutrimouse$genotype))
print(n_groups)  # 2
[1] 2
# Set color: brewer.pal() requires n >= 3, so pick 2 colors manually
col_palette <- c("blue", "red")

# Plot samples in the averaged XY latent space, labelled by diet
plotIndiv(MyResult.spls, 
          group = nutrimouse$genotype, 
          rep.space = "XY-variate", 
          legend = TRUE, 
          col = col_palette, 
          ind.names = nutrimouse$diet, 
          title = 'Nutrimouse: sPLS')

p2<- plotLoadings(MyResult.spls)

#p2

# 5. DIABLO analysis, equivalent to an extension of PLS, can introduce multiple matrices in X.
# Load data: three matched omics blocks (mRNA, miRNA, protein) + subtype
data(breast.TCGA)
X <- list(mRNA = breast.TCGA$data.train$mrna,
          miRNA = breast.TCGA$data.train$mirna,
          protein = breast.TCGA$data.train$protein)
Y <- breast.TCGA$data.train$subtype
# Number of variables kept per block per component
list.keepX <- list(mRNA = c(16, 17), miRNA = c(18, 5), protein = c(5, 5))
MyResult.diablo <- block.splsda(X, Y, keepX = list.keepX)

# Set color
n_groups <- length(unique(Y))  
col_palette_indiv <- brewer.pal(n_groups, "Set1")
col_palette_var <- brewer.pal(n_groups, "Paired") 

# Sample plot per block
plotIndiv(MyResult.diablo, 
          ind.names = FALSE, 
          legend = TRUE, 
          cex = c(1, 2, 3), 
          title = 'BRCA with DIABLO',
          col = col_palette_indiv)

# Correlation-circle plot of the selected variables
plotVar(MyResult.diablo, 
        var.names = c(FALSE, FALSE, TRUE),
        legend = TRUE, 
        pch = c(16, 16, 1), 
        col = col_palette_var)

# Pairwise correlation between blocks on component 1
plotDiablo(MyResult.diablo, ncomp = 1)

# Circos plot of cross-block correlations above the cutoff
circosPlot(MyResult.diablo, cutoff = 0.7)

# ROC curves / AUC for the miRNA block on component 2
Myauc.diablo <- auroc(MyResult.diablo, roc.block = "miRNA", roc.comp = 2)

$mRNA
$mRNA$comp1
                     AUC p-value
Basal vs Other(s) 0.9970 0.00000
Her2 vs Other(s)  0.6106 0.06149
LumA vs Other(s)  0.9883 0.00000

$mRNA$comp2
                     AUC   p-value
Basal vs Other(s) 0.9992 0.000e+00
Her2 vs Other(s)  0.9703 1.776e-15
LumA vs Other(s)  0.9970 0.000e+00


$miRNA
$miRNA$comp1
                     AUC p-value
Basal vs Other(s) 0.9551  0.0000
Her2 vs Other(s)  0.5650  0.2716
LumA vs Other(s)  0.9239  0.0000

$miRNA$comp2
                     AUC  p-value
Basal vs Other(s) 0.9623 0.00e+00
Her2 vs Other(s)  0.8650 6.67e-10
LumA vs Other(s)  0.9589 0.00e+00


$protein
$protein$comp1
                     AUC  p-value
Basal vs Other(s) 0.9524 0.000000
Her2 vs Other(s)  0.6678 0.004542
LumA vs Other(s)  0.9874 0.000000

$protein$comp2
                     AUC   p-value
Basal vs Other(s) 0.9790 0.000e+00
Her2 vs Other(s)  0.9256 6.111e-13
LumA vs Other(s)  0.9950 0.000e+00
#You can also use the caret package in R software to achieve

Generalized Estimating Equation Analysis

# Load packages
library(geepack)
library(tidyverse)

# Load data: dietox — longitudinal pig weights (repeated measures per Pig)
data("dietox")

# Data processing: treat copper and vitamin E doses as factors
dietox <- dietox %>%
  mutate(Cu = factor(Cu),
         Evit = factor(Evit))

# Define formula shared by all GEE fits below
model_formula <- Weight ~ Time + Evit + Cu

# Independence working correlation: observations within a pig assumed
# uncorrelated (robust sandwich SEs still account for clustering)
gee_independent <- geeglm(model_formula, id = Pig, data = dietox, 
                          family = gaussian, corstr = "ind")
summary(gee_independent)

Call:
geeglm(formula = model_formula, family = gaussian, data = dietox, 
    id = Pig, corstr = "ind")

 Coefficients:
            Estimate  Std.err     Wald Pr(>|W|)    
(Intercept) 15.07283  1.42190  112.371   <2e-16 ***
Time         6.94829  0.07979 7582.549   <2e-16 ***
EvitEvit100  2.08126  1.84178    1.277    0.258    
EvitEvit200 -1.11327  1.84830    0.363    0.547    
CuCu035     -0.78865  1.53486    0.264    0.607    
CuCu175      1.77672  1.82134    0.952    0.329    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation structure = independence 
Estimated Scale Parameters:

            Estimate Std.err
(Intercept)    48.28   9.309
Number of clusters:   72  Maximum cluster size: 12 
# Sequential Wald tests of each term under the independence structure
anova(gee_independent)
Analysis of 'Wald statistic' Table
Model: gaussian, link: identity
Response: Weight
Terms added sequentially (first to last)

     Df   X2 P(>|Chi|)    
Time  1 7507    <2e-16 ***
Evit  2    4      0.15    
Cu    2    2      0.41    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interpretation of results: Time variable significantly affects weight

# Exchangeable working correlation: a single common correlation between any
# two measurements on the same pig
gee_exchangeable <- geeglm(model_formula, id = Pig, data = dietox, 
                           family = gaussian, corstr = "ex")
summary(gee_exchangeable)

Call:
geeglm(formula = model_formula, family = gaussian, data = dietox, 
    id = Pig, corstr = "ex")

 Coefficients:
            Estimate Std.err    Wald Pr(>|W|)    
(Intercept)  15.0984  1.4206  112.96   <2e-16 ***
Time          6.9426  0.0796 7605.79   <2e-16 ***
EvitEvit100   2.0414  1.8431    1.23     0.27    
EvitEvit200  -1.1103  1.8452    0.36     0.55    
CuCu035      -0.7652  1.5354    0.25     0.62    
CuCu175       1.7871  1.8189    0.97     0.33    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation structure = exchangeable 
Estimated Scale Parameters:

            Estimate Std.err
(Intercept)     48.3    9.31
  Link = identity 

Estimated Correlation Parameters:
      Estimate Std.err
alpha    0.766  0.0326
Number of clusters:   72  Maximum cluster size: 12 
# Sequential Wald tests of each term under the exchangeable structure
anova(gee_exchangeable)
Analysis of 'Wald statistic' Table
Model: gaussian, link: identity
Response: Weight
Terms added sequentially (first to last)

     Df   X2 P(>|Chi|)    
Time  1 7604    <2e-16 ***
Evit  2    4      0.16    
Cu    2    2      0.41    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interpretation of results: Time variable is still the only significant influencing factor

# Autoregressive correlation structure
gee_ar1 <- geeglm(model_formula, id = Pig, data = dietox, 
                  family = gaussian, corstr = "ar1")
summary(gee_ar1)

Call:
geeglm(formula = model_formula, family = gaussian, data = dietox, 
    id = Pig, corstr = "ar1")

 Coefficients:
            Estimate Std.err    Wald Pr(>|W|)    
(Intercept)  17.6124  1.3354  173.95   <2e-16 ***
Time          6.7324  0.0756 7921.11   <2e-16 ***
EvitEvit100   2.3782  1.7676    1.81     0.18    
EvitEvit200  -0.9779  1.7369    0.32     0.57    
CuCu035      -0.3976  1.3928    0.08     0.78    
CuCu175       1.2376  1.7376    0.51     0.48    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation structure = ar1 
Estimated Scale Parameters:

            Estimate Std.err
(Intercept)     50.5    9.41
  Link = identity 

Estimated Correlation Parameters:
      Estimate Std.err
alpha    0.933  0.0116
Number of clusters:   72  Maximum cluster size: 12 
# Sequential Wald tests of each term under the AR(1) structure
anova(gee_ar1)
Analysis of 'Wald statistic' Table
Model: gaussian, link: identity
Response: Weight
Terms added sequentially (first to last)

     Df   X2 P(>|Chi|)    
Time  1 7907    <2e-16 ***
Evit  2    5      0.07 .  
Cu    2    1      0.65    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interpretation of the results: The Time variable continues to remain significant, and the coefficient and standard error have almost remained unchanged.

# Unstructured correlation structure
gee_unstructured <- geeglm(model_formula, id = Pig, data = dietox, 
                           family = gaussian, corstr = "unstructured")
summary(gee_unstructured)

Call:
geeglm(formula = model_formula, family = gaussian, data = dietox, 
    id = Pig, corstr = "unstructured")

 Coefficients:
            Estimate Std.err    Wald Pr(>|W|)    
(Intercept)   14.574   1.720   71.79   <2e-16 ***
Time           7.544   0.131 3329.24   <2e-16 ***
EvitEvit100   -2.312   2.218    1.09     0.30    
EvitEvit200   -1.790   1.952    0.84     0.36    
CuCu035        1.082   2.098    0.27     0.61    
CuCu175        2.424   2.264    1.15     0.28    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation structure = unstructured 
Estimated Scale Parameters:

            Estimate Std.err
(Intercept)     63.3    12.5
  Link = identity 

Estimated Correlation Parameters:
            Estimate Std.err
alpha.1:2     0.2950  0.0435
alpha.1:3     0.2586  0.0498
alpha.1:4     0.1974  0.0670
alpha.1:5     0.1625  0.0771
alpha.1:6     0.0811  0.0915
alpha.1:7     0.1141  0.1029
alpha.1:8     0.1293  0.1050
alpha.1:9     0.1126  0.1197
alpha.1:10    0.1022  0.1400
alpha.1:11    0.1720  0.1479
alpha.1:12    0.0874  0.1780
alpha.2:3     0.3831  0.0572
alpha.2:4     0.3906  0.0621
alpha.2:5     0.4058  0.0631
alpha.2:6     0.3787  0.0625
alpha.2:7     0.4091  0.0659
alpha.2:8     0.4155  0.0664
alpha.2:9     0.4100  0.0737
alpha.2:10    0.3950  0.0865
alpha.2:11    0.4403  0.0966
alpha.2:12    0.4133  0.1235
alpha.3:4     0.5304  0.0701
alpha.3:5     0.5721  0.0663
alpha.3:6     0.5694  0.0606
alpha.3:7     0.5861  0.0644
alpha.3:8     0.5884  0.0651
alpha.3:9     0.6040  0.0700
alpha.3:10    0.5897  0.0811
alpha.3:11    0.6292  0.0890
alpha.3:12    0.6416  0.1100
alpha.4:5     0.7128  0.0537
alpha.4:6     0.7500  0.0464
alpha.4:7     0.7617  0.0430
alpha.4:8     0.7673  0.0433
alpha.4:9     0.7910  0.0482
alpha.4:10    0.7828  0.0579
alpha.4:11    0.8195  0.0686
alpha.4:12    0.8598  0.0887
alpha.5:6     0.9187  0.0571
alpha.5:7     0.9272  0.0512
alpha.5:8     0.9260  0.0443
alpha.5:9     0.9625  0.0420
alpha.5:10    0.9306  0.0496
alpha.5:11    0.9788  0.0532
alpha.5:12    1.0413  0.0684
alpha.6:7     1.0427  0.0712
alpha.6:8     1.0405  0.0622
alpha.6:9     1.0937  0.0637
alpha.6:10    1.0680  0.0543
alpha.6:11    1.1005  0.0585
alpha.6:12    1.2003  0.0706
alpha.7:8     1.1054  0.0619
alpha.7:9     1.1533  0.0552
alpha.7:10    1.1307  0.0492
alpha.7:11    1.1735  0.0487
alpha.7:12    1.2591  0.0762
alpha.8:9     1.1819  0.0533
alpha.8:10    1.1529  0.0504
alpha.8:11    1.1942  0.0510
alpha.8:12    1.2917  0.0841
alpha.9:10    1.2467  0.0510
alpha.9:11    1.3051  0.0558
alpha.9:12    1.4178  0.0882
alpha.10:11   1.3561  0.0627
alpha.10:12   1.4596  0.1023
alpha.11:12   1.5600  0.1103
Number of clusters:   72  Maximum cluster size: 12 
# Sequential Wald tests of each term under the unstructured structure
anova(gee_unstructured)
Analysis of 'Wald statistic' Table
Model: gaussian, link: identity
Response: Weight
Terms added sequentially (first to last)

     Df    X2 P(>|Chi|)    
Time  1 20.21   6.9e-06 ***
Evit  2  7.01      0.03 *  
Cu    2  1.15      0.56    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Result interpretation: The Evit variable shows a certain influence under this correlation structure, and the coefficient and standard error have changed

# Collect all four fitted models' summaries for side-by-side comparison
model_summaries <- list(
  "Independence" = summary(gee_independent),
  "Exchangeable" = summary(gee_exchangeable),
  "AR(1)" = summary(gee_ar1),
  "Unstructured" = summary(gee_unstructured)
)

# It can be seen from the model results that the Time variable shows a significant impact under different correlation structure assumptions.
# The Evit variable only shows significance under the unstructured correlation assumption, which indicates that different correlation structure choices may have a certain impact on the model results.

Structural Equation Models (SEM)

# Load packages
library(lavaan)
library(semPlot)

# Load data; recode status == 2 as a 0/1 dummy for use as an exogenous
# predictor. NOTE(review): meaning of status levels comes from the input
# file — confirm 2 is the level of interest.
mydata <- read.csv("data/meta.csv", row.names = 1)
mydata$status_dummy <- ifelse(mydata$status == 2, 1, 0)

# Get the numeric variables and exclude the status and status_dummy columns
numeric_vars <- names(mydata)[sapply(mydata, is.numeric)]
numeric_vars_to_scale <- numeric_vars[!(numeric_vars %in% c("status", "status_dummy"))]

# Standardize numerical variables (z-scores) so path coefficients are
# comparable across indicators
mydata_scaled <- mydata
mydata_scaled[, numeric_vars_to_scale] <- scale(mydata[, numeric_vars_to_scale])

# Define SEM model (lavaan syntax): each latent has a single indicator;
# all four latents are regressed on OM and disease status, and the
# bacterial/fungal richness latents are allowed to covary.
model <- '
  latentBact =~ Bacteria_richness  
  latentFung =~ Fungi_richness  
  latentFAPROTAX =~ FAPROTAX  
  latentFunGuild =~ FunGuild

  latentBact ~ OM + status_dummy
  latentFung ~ OM + status_dummy
  latentFAPROTAX ~ OM + status_dummy
  latentFunGuild ~ OM + status_dummy

  latentBact ~~ latentFung
'

# Fitting the model
fit <- sem(model, data = mydata_scaled, int.ov.free = TRUE)

# Save model summary, coefficients, and ANOVA results to file
capture.output(
  summary(fit),        
  coef(fit),        
  anova(fit),
  file = "results/Other_microbiome_analysis/model01_stats.txt"
)

# Draw a structural equation model diagram and optimize the layout and color matching
sem_plot <- semPaths(
  fit,
  what = "est",                 # Display path coefficients
  edge.label.cex = 1.0,          # Increase edge label font size
  layout = "tree2",              # Use a clearer tree layout
  style = "lisrel",              # Use Lisrel styles to simplify graphics
  color = list(lat = "#76C7C0", man = "#F7C04A", edge = "gray50"),  # Set color
  sizeMan = 8,                   # Set the node size of the observation variable
  sizeLat = 10,                  # Setting the node size of latent variables
  residuals = FALSE,             # Do not display residuals
  mar = c(6, 6, 6, 6)            # Set margins to increase the sense of space
)

# Save plot (re-drawn inside the pdf device; semPaths draws as a side effect)
pdf("results/Other_microbiome_analysis/SEM_Plot01.pdf", width = 10, height = 8)
semPaths(fit, what = "est", edge.label.cex = 1.0, layout = "tree2",
         style = "lisrel", color = list(lat = "#76C7C0", man = "#F7C04A", edge = "gray50"),
         sizeMan = 8, sizeLat = 10, residuals = FALSE, mar = c(6, 6, 6, 6))
dev.off()
png 
  2 
# Save the same diagram as PNG (same parameters as the PDF export)
png("results/Other_microbiome_analysis/SEM_Plot01.png", width = 1200, height = 960, res = 150)
semPaths(fit, what = "est", edge.label.cex = 1.0, layout = "tree2",
         style = "lisrel", color = list(lat = "#76C7C0", man = "#F7C04A", edge = "gray50"),
         sizeMan = 8, sizeLat = 10, residuals = FALSE, mar = c(6, 6, 6, 6))
dev.off()
png 
  2 

Survival Analysis (生存分析)

# Load packages
library(survival)
library(dplyr)
library(autoReg)
library(rrtable)
library(survminer)
library(jstable)
library(patchwork)

# Load data: NCCTG lung cancer cohort, complete cases only; sex and ECOG
# performance score treated as factors
mydata <- survival::lung %>% na.omit()
mydata <- mydata %>% mutate(sex = factor(sex), ph.ecog = factor(ph.ecog))

# Descriptive Statistics: Generate a table of baseline characteristics
# stratified by survival status (method = 3 forces nonparametric tests)
table1 <- gaze(status ~ ., data = mydata) %>% myft()
table2 <- gaze(status ~ ., data = mydata, method = 3) %>% myft()

# Fit Kaplan-Meier survival curves stratified by sex
fit <- survfit(Surv(time, status) ~ sex, data = mydata)

# Kaplan-Meier curves with CIs, log-rank p-value, median-survival lines and
# a number-at-risk table, written straight to PDF.
# NOTE(review): legend.labs assumes factor level 1 = Male, 2 = Female (the
# survival::lung coding) — confirm the label order matches the strata.
pdf("results/Other_microbiome_analysis/survival_plot.pdf", width = 10, height = 8)
surv_plot <- ggsurvplot(
  fit, 
  data = mydata, 
  conf.int = TRUE, 
  pval = TRUE, 
  pval.size = 4.5,
  surv.median.line = "hv", 
  risk.table = TRUE, 
  risk.table.height = 0.2,
  risk.table.col = "strata",
  xlab = "Follow-up time (days)", 
  legend.title = "Sex", 
  legend.labs = c("Male", "Female"), 
  palette = c("#E64B35FF", "#4DBBD5FF"), 
  break.x.by = 100,
  ggtheme = theme_classic() + 
    theme(
      plot.title = element_text(size = 14, face = "bold"),
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 10)
    )
)
surv_plot
dev.off()
png 
  2 
# Use patchwork to combine the main survival curve plot with additional components such as risk tables
# complete_plot <- surv_plot$plot / surv_plot$table

# Save plot
# ggsave("survival_plot.pdf", 
#        #plot = complete_plot,
#        plot = surv_plot$plot,
#        device = "pdf", 
#        width = 8, height = 10)

# Multivariable Cox proportional hazards model; autoReg adds matching
# univariable fits (uni = TRUE) for a combined regression table
cox_fit <- coxph(Surv(time, status) ~ age + sex + ph.ecog + ph.karno + pat.karno + meal.cal + wt.loss, 
                 data = mydata)
cox_result <- autoReg(cox_fit, uni = TRUE) %>% myft()

# Export Cox regression results as PPTX
table2pptx(cox_result) 

# Forest plot of hazard ratios with 95% confidence intervals
forest_plot <- ggforest(
  cox_fit, 
  data = mydata, 
  main = "Hazard Ratios (95% CI)", 
  fontsize = 0.9, 
  cpositions = c(0.02, -0.1, 0.3),
  refLabel = "Reference",
  noDigits = 2
)
forest_plot

# Save plot
ggsave("results/Other_microbiome_analysis/cox_forest_plot.pdf", plot = forest_plot, device = "pdf", width = 9, height = 6)


# Variable truncation and grouping
mydata$status <- mydata$status - 1
mydata$sex <- ifelse(mydata$sex == 1, "Male", "Female")
cutpoints <- surv_cutpoint(mydata, time = "time", event = "status",
                           variables = c("age", "meal.cal", "ph.karno", "pat.karno"))
mydata_cut <- surv_categorize(cutpoints)
# Use cbind to merge mydata_cut with other columns of mydata
mydata <- cbind(mydata[, c("sex", "ph.ecog", "wt.loss")], mydata_cut)

# Check the structure of the merged dataset
str(mydata)
'data.frame':   167 obs. of  9 variables:
 $ sex      : chr  "Male" "Male" "Male" "Female" ...
 $ ph.ecog  : Factor w/ 4 levels "0","1","2","3": 1 2 2 3 3 2 3 2 2 2 ...
 $ wt.loss  : num  15 11 0 10 1 16 34 27 60 -5 ...
 $ time     : num  455 210 1022 310 361 ...
 $ status   : num  1 1 0 1 1 1 1 1 1 1 ...
 $ age      : chr  "high" "low" "high" "high" ...
 $ meal.cal : chr  "high" "high" "high" "low" ...
 $ ph.karno : chr  "high" "high" "low" "low" ...
 $ pat.karno: chr  "high" "low" "high" "low" ...
# Subgroup analysis: Cox HR for sex within each level of the dichotomized
# covariates (jstable), exported to CSV
subgroup_res <- TableSubgroupMultiCox(
  formula = Surv(time, status) ~ sex, 
  var_subgroups = c("age", "meal.cal", "ph.karno", "pat.karno"), 
  data = mydata
)
write.csv(subgroup_res, "results/Other_microbiome_analysis/subgroup_analysis_results.csv")

Environmental factors - difference analysis (环境因子-差异分析)

# Analysis of differences among groups in environmental factors:
# one-way ANOVA + multiple comparisons per factor (EasyStat), factors taken
# from column 3 onward of env.csv.
env = read.csv("./data/env.csv")
data = as.data.frame(env)
# FIX: tibble::as.tibble() is deprecated — use as_tibble() (same behavior;
# tibble is attached via tidyverse at the top of the file).
data = as_tibble(data)
result = EasyStat::MuiaovMcomper2(data = data,num = c(3:ncol(data)))

# Faceted boxplots of the first four factors, annotated with the compact
# letter display ("abc") from the multiple-comparison result.
result1 = EasyStat::FacetMuiPlotresultBox(data = data,
                                          num = c(3:6),
                                          result = result,
                                          sig_show ="abc",ncol = 5 )

# Shared theme for all environmental-factor panels
mytheme1 = theme_classic() + theme(
  panel.background=element_blank(),
  panel.grid=element_blank(),
  legend.position="right",
  legend.title = element_blank(),
  legend.background=element_blank(),
  legend.key=element_blank(),
  # legend.text= element_text(size=7),
  # text=element_text(),
  # axis.text.x=element_text(angle=45,vjust=1, hjust=1)
  plot.title = element_text(vjust = -8.5,hjust = 0.1),
  axis.title.y =element_text(size = 15,face = "bold",colour = "black"),
  axis.title.x =element_text(size = 15,face = "bold",colour = "black"),
  axis.text = element_text(size = 10,face = "bold"),
  axis.text.x = element_text(colour = "black",size = 10),
  axis.text.y = element_text(colour = "black",size = 10),
  legend.text = element_text(size = 10,face = "bold")
)

colset1 <- RColorBrewer::brewer.pal(9,"Set1")

# Apply the theme and palette to the faceted boxplot returned by EasyStat
p1_1 = result1[[1]] + 
  mytheme1 +
  ggplot2::guides(fill = guide_legend(title = NULL)) +
  ggplot2::scale_fill_manual(values = colset1)
p1_1

# Rebuild the same comparison as violin + jitter plots from the long-format
# data in result1[[2]] (columns: group, dd = value, name = factor, y/stat =
# significance-letter position and label). Note: this overwrites p1_1.
p1_1 = result1[[2]] %>% ggplot(aes(x=group , y=dd )) + 
  geom_violin(alpha=1, aes(fill=group)) +
  geom_jitter( aes(color = group),position=position_jitter(0.17), size=3, alpha=0.5)+
  labs(x="", y="")+
  facet_wrap(.~name,scales="free_y",ncol  = 3) +
  # theme_classic()+
  geom_text(aes(x=group , y=y ,label=stat)) +
  # ggplot2::scale_x_discrete(limits = axis_order) + 
  mytheme1 +
  guides(color=guide_legend(title = NULL),
         shape=guide_legend(title = NULL),
         fill = guide_legend(title = NULL)
  ) +
  ggplot2::scale_fill_manual(values = colset1)
p1_1

# Faceted bar plots (mean +/- error) with significance letters, all factors.
res = EasyStat::FacetMuiPlotresultBar(data = data,num = c(3:ncol(data)),result = result,sig_show ="abc",ncol = 5)
p1_2 = res[[1]]+ 
  # scale_x_discrete(limits = axis_order) +
  
  # FIX: guides(color = FALSE) is deprecated since ggplot2 3.3.4 — the
  # supported spelling for suppressing a legend is "none".
  guides(color = "none") +
  mytheme1+ 
  guides(fill = guide_legend(title = NULL))+
  scale_fill_manual(values = colset1)
p1_2

# Combined box + bar panels with significance letters, all factors
res = EasyStat::FacetMuiPlotReBoxBar(data = data,num = c(3:ncol(data)),result = result,sig_show ="abc",ncol = 5)
p1_3 = res[[1]]+ 
  # scale_x_discrete(limits = axis_order) + 
  mytheme1 + 
  guides(fill = guide_legend(title = NULL))+
  scale_fill_manual(values = colset1)
p1_3

# Export the three faceted panels (box, bar, box+bar) as both PDF and JPG.
# Figure dimensions scale with the number of groups (gnum); limitsize is
# disabled because tall multi-facet figures exceed ggsave's default cap.
gnum = 4
panel_list <- list(env_Facet_box = p1_1,
                   env_Facet_bar = p1_2,
                   env_Facet_boxbar = p1_3)
for (ext in c(".pdf", ".jpg")) {
  for (stem in names(panel_list)) {
    FileName <- paste("results/Other_microbiome_analysis/env_diff/", stem, ext, sep = "")
    ggsave(FileName, panel_list[[stem]],
           width = ((1 + gnum) * 3), height = 4 * gnum, limitsize = FALSE)
  }
}

Environmental factors - correlation analysis (环境因子相关分析)

# Load packages
library(vegan)
library(ggrepel)

# Load data
set.seed(365)
env = read.csv("./data/env2.csv")
# Columns 3:36 are the response variables; earlier columns are metadata
data1 <- env[, 3:36]

# Hellinger-transform the responses, then constrain them on the grouping
# variable (RDA with a single categorical predictor)
data1 <- decostand(data1, method = "hellinger")
data2 = as.data.frame(env$group)
res <- rda(data1 ~ . , data2)

# Select RDA1, RDA2 axis for visualization (group centroids)
centroids <- as.data.frame(res$CCA$centroids[,c(1,2)]) 

# Add a column to the centroids data frame to distinguish shapes and colors.
# NOTE(review): assumes exactly 4 centroids ordered A,A,B,B — confirm this
# matches the levels of env$group.
centroids$group <- factor(c(rep('A',2),rep('B',2)), levels = c('A', 'B')) 

# Response-variable (species) scores on the first two constrained axes
rda.v <- as.data.frame(res$CCA$v[,c(1,2)])
# Extract the response variable name for displaying labels in the plot
rda.v$name = row.names(rda.v) 

# Arrow table: one arrow per response variable from the origin; selected
# variables highlighted in red
arrow_data <- data.frame(x=rda.v[,1], y = rda.v[,2], x_end=0, y_end=0, name=rda.v[,3], col='blue')
arrow_data[arrow_data$name %in% c('AP', 'LB', 'LRL1', 'RGR'), ]$col <- 'red'

p1 <- ggplot(data = centroids) + 
  geom_point(size=2, aes(x = RDA1,y=RDA2,color=group, shape=group))+
  ggrepel::geom_text_repel(data = arrow_data, aes(x,y,label=name),
                           size=3, fontface="italic")+ 
  geom_segment(data = arrow_data,
               aes(x=0, y=0, xend=x, yend=y), 
               arrow = arrow(length = unit(0.05,"inches")), color = arrow_data$col, size=.8)+ 
  geom_hline(yintercept = 0, linetype = "dashed", size=1.2) + 
  geom_vline(xintercept = 0,linetype = "dashed", size=1.2)+ 
  theme(legend.title = element_blank(), legend.position = c(0.9,0.8), 
        legend.background = element_blank()) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"), 
        panel.border =element_rect(colour = "black", fill=NA, size=1)) + 
  # NOTE(review): axis explained-variance percentages are hard-coded — they
  # should be recomputed from res$CCA$eig if the input data changes.
  labs(x = 'RDA1(0.734%)',y = 'RDA2(0.048%)')
p1

ggsave('results/Other_microbiome_analysis/RDA_plot.pdf', p1, height = 6, width = 7)

Microbial and environmental factors - co-ordination_RDA/CCA(微生物和环境因子共排序RDA/CCA)

# Load packages
library(vegan)
library(ggrepel)
library(ggplot2)

# Before conducting the ordination analysis, run a detrended correspondence
# analysis (DCA) on the community data and judge by the largest "Lengths of
# gradient" value among the 4 axes: > 4.0 -> CCA; 3.0-4.0 -> RDA or CCA;
# < 3.0 -> RDA is preferable. This rule of thumb is not absolute — in
# practice run both CCA and RDA and choose based on the results.

# Loading sample species community data
# sampledata <- read.table("otu_table.txt", head = TRUE, row.names=1,sep="\t")
sampledata <- read.csv("data/test_otu.csv",row.names = 1)
sampledata <- sampledata[ , 1:24]

# Analysis requires data table rows as samples, columns as species
sampledata <- t(sampledata)

# DCA analysis
dca <- decorana(veg = sampledata)
# Extract the maximum site score (gradient length) on each of the 4 axes
dca1 <- max(dca$rproj[,1])
dca2 <- max(dca$rproj[,2])
dca3 <- max(dca$rproj[,3])
dca4 <- max(dca$rproj[,4])
GL <- data.frame(DCA1 = c(dca1), DCA2 = c(dca2), DCA3 = c(dca3), DCA4 = c(dca4))
rownames(GL) <- c("Gradient length")
write.csv(GL, file = "results/Other_microbiome_analysis/dca.csv")

# redundancy analysis (RDA) or canonical correspondence analysis (CCA)
# RDA combines correspondence analysis with multiple regression analysis, and each step of the calculation is regressed with environmental factors.
# CCA is a ranking method based on a unimodal model and combines multiple environmental factors in the ranking process.
# RDA is based on a linear model, while CCA is based on a unimodal model. You can choose which analysis method gives you the best community ranking results.

design<-read.csv("data/test_design.csv",row.names = 1)
otu_tax<-read.csv("data/test_otu.csv",row.names = 1)

# CCA: OTUs 200-300 x 24 samples (transposed so rows = samples) constrained
# on environmental columns 6:11 of the design table
cca <- cca(t(otu_tax[200:300,1:24]), design[,6:11], scale = TRUE)
ccascore <- scores(cca)

# Or RDA analysis (same inputs)
RDA = rda(t(otu_tax[200:300,1:24]),design[,6:11], scale = TRUE)

# Get the first and second axes of RDA. Here, RDA is not visualized, but CCA is used as an example for visualization.
RDA_sample<- scores(RDA,choices = 1:2, display = 'sp')
RDA_env<-RDA$CCA$biplot[,1:2]

# Environmental-factor arrows (biplot scores) and sample scores on the
# first two CCA axes
CCAE <- as.data.frame(cca$CCA$biplot[,1:2])
CCA1 <- ccascore$sites[,1]
CCA2 <- ccascore$sites[,2]

# Construct a data frame with the first and second axes of the samples,
# then attach the design metadata (row order must match)
plotdata <- data.frame(rownames(ccascore$sites), CCA1, CCA2)
colnames(plotdata) <- c("sample","CCA1","CCA2")
plotdata<-cbind(plotdata,design)

# Percentage of constrained variance explained by axes 1 and 2 (1 decimal)
cca1 <- round(cca$CCA$eig[1]/sum(cca$CCA$eig)*100,1)
cca2 <- round(cca$CCA$eig[2]/sum(cca$CCA$eig)*100,1)

# Plot CCA results
p1 <- ggplot(plotdata,aes(x=CCA1,y=CCA2,color=plotdata$Treatment))+
  geom_point(size=4,aes())+
  stat_ellipse(aes(fill=plotdata$Treatment),geom = "polygon",size=0.6,level = 0.95,alpha = 0.1)+
  geom_segment(data=CCAE,aes(x = 0, y = 0, xend = CCAE[,1]*3.5, yend =  CCAE[,2]*3.5),
               arrow = arrow(length = unit(0.03, 'npc')),size =1,color="red")+
  geom_text(data = CCAE,aes(CCA1 * 4,CCA2 * 4,label = rownames(CCAE)),color = 'red',size = 4)+
  xlab(paste("CCA1 (",cca1,"%",")"))+ylab(paste("CCA2 (",cca2,"%",")"))+
  theme_bw(base_line_size = 1.05,base_rect_size = 1.05)+
  scale_color_manual(values=c("#3FBDA7","#0172B6","#BD3C29","#F0965D"))+
  scale_fill_manual(values=c("#3FBDA7","#0172B6","#BD3C29","#F0965D"))+
  theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank())+
  geom_hline(aes(yintercept=0), colour="gray45",size=0.8, linetype="dashed")+
  geom_vline(aes(xintercept=0), colour="gray45",size=0.8, linetype="dashed")
p1

ggsave("results/Other_microbiome_analysis/CCA_plot.pdf", p1, width = 8, height = 6)

Hierarchical segmentation RDA/CCA(层次分割RDA/CCA)

# Load packages and data for hierarchical partitioning of RDA/CCA
library(rdacca.hp)
data(mite)  # Oribatid mite species abundance matrix at the observation site
mite[1:6,1:6]
  Brachy PHTH HPAV RARD SSTR Protopl
1     17    5    5    3    2       1
2      2    7   16    0    6       0
3      4    3    1    1    2       0
4     23    7   10    2    2       0
5      5    8   13    9    0      13
6     19    7    5    9    3       2
#write.csv(mite,"mite.csv")
data(mite.env)  # Environmental variable matrix of observation sites
#write.csv(mite.env,"miteenv.csv")
data(mite.xy)  # Geographic coordinates of the observation site
#write.csv(mite.xy,"mitexy.csv")

# RDA: Hellinger-transform abundances, then constrain on all environmental
# variables (decostand/rda from vegan, loaded earlier)
# mite <- read.csv(file.choose(),header = T,row.names = 1)
mite.hel <- decostand(mite, method = 'hellinger')
mite.rda <- rda(mite.hel~., mite.env, scale = FALSE)
summary(mite.rda)

Call:
rda(formula = mite.hel ~ SubsDens + WatrCont + Substrate + Shrub +      Topo, data = mite.env, scale = FALSE) 

Partitioning of variance:
              Inertia Proportion
Total           0.394      1.000
Constrained     0.208      0.527
Unconstrained   0.187      0.473

Eigenvalues, and their contribution to the variance 

Importance of components:
                       RDA1   RDA2   RDA3   RDA4   RDA5    RDA6    RDA7    RDA8
Eigenvalue            0.137 0.0247 0.0138 0.0112 0.0083 0.00548 0.00211 0.00189
Proportion Explained  0.348 0.0626 0.0351 0.0284 0.0210 0.01391 0.00534 0.00480
Cumulative Proportion 0.348 0.4108 0.4459 0.4742 0.4953 0.50920 0.51454 0.51934
                         RDA9    RDA10   RDA11    PC1    PC2    PC3    PC4
Eigenvalue            0.00127 0.000983 0.00057 0.0427 0.0238 0.0172 0.0137
Proportion Explained  0.00323 0.002494 0.00144 0.1084 0.0605 0.0436 0.0347
Cumulative Proportion 0.52257 0.525060 0.52650 0.6349 0.6954 0.7390 0.7737
                         PC5     PC6     PC7     PC8     PC9    PC10    PC11
Eigenvalue            0.0116 0.00968 0.00769 0.00727 0.00674 0.00527 0.00478
Proportion Explained  0.0293 0.02456 0.01950 0.01844 0.01708 0.01336 0.01213
Cumulative Proportion 0.8030 0.82761 0.84711 0.86555 0.88263 0.89599 0.90812
                         PC12   PC13    PC14    PC15    PC16    PC17    PC18
Eigenvalue            0.00439 0.0040 0.00353 0.00294 0.00260 0.00251 0.00222
Proportion Explained  0.01115 0.0102 0.00895 0.00746 0.00658 0.00638 0.00563
Cumulative Proportion 0.91927 0.9294 0.93838 0.94583 0.95242 0.95879 0.96443
                         PC19    PC20    PC21   PC22   PC23    PC24     PC25
Eigenvalue            0.00210 0.00194 0.00171 0.0013 0.0011 0.00102 0.000923
Proportion Explained  0.00533 0.00491 0.00435 0.0033 0.0028 0.00258 0.002342
Cumulative Proportion 0.96976 0.97467 0.97901 0.9823 0.9851 0.98769 0.990031
                          PC26     PC27     PC28     PC29     PC30     PC31
Eigenvalue            0.000806 0.000662 0.000545 0.000478 0.000461 0.000367
Proportion Explained  0.002045 0.001680 0.001383 0.001211 0.001168 0.000932
Cumulative Proportion 0.992076 0.993756 0.995139 0.996351 0.997519 0.998451
                          PC32     PC33     PC34     PC35
Eigenvalue            0.000237 0.000178 0.000138 5.74e-05
Proportion Explained  0.000601 0.000451 0.000351 1.45e-04
Cumulative Proportion 0.999052 0.999504 0.999855 1.00e+00

Accumulated constrained eigenvalues
Importance of components:
                       RDA1   RDA2   RDA3   RDA4   RDA5    RDA6    RDA7    RDA8
Eigenvalue            0.137 0.0247 0.0138 0.0112 0.0083 0.00548 0.00211 0.00189
Proportion Explained  0.661 0.1188 0.0666 0.0539 0.0400 0.02642 0.01015 0.00912
Cumulative Proportion 0.661 0.7803 0.8469 0.9007 0.9407 0.96712 0.97727 0.98639
                         RDA9    RDA10   RDA11
Eigenvalue            0.00127 0.000983 0.00057
Proportion Explained  0.00613 0.004738 0.00274
Cumulative Proportion 0.99252 0.997256 1.00000
# Triplot of the fitted RDA (sites, species, constraints)
plot(mite.rda)

# Hierarchical segmentation
# rdacca.hp partitions the adjusted R2 of the RDA into each explanatory
# variable's unique and average shared contribution.
mite.rda.hp <- rdacca.hp(mite.hel, mite.env, method = 'RDA', type = 'adjR2', scale = FALSE)
mite.rda.hp
$Method_Type
[1] "RDA"   "adjR2"

$Total_explained_variation
[1] 0.437

$Hier.part
          Unique Average.share Individual I.perc(%)
SubsDens  0.0363       -0.0043     0.0320      7.32
WatrCont  0.1009        0.0633     0.1642     37.57
Substrate 0.0416        0.0127     0.0543     12.43
Shrub     0.0288        0.0730     0.1018     23.30
Topo      0.0465        0.0378     0.0843     19.29

attr(,"class")
[1] "rdaccahp"
# Bar plot of each variable's individual contribution
plot(mite.rda.hp)

# Save hierarchical segmentation results
mite.rda.hp$Hier.part
          Unique Average.share Individual I.perc(%)
SubsDens  0.0363       -0.0043     0.0320      7.32
WatrCont  0.1009        0.0633     0.1642     37.57
Substrate 0.0416        0.0127     0.0543     12.43
Shrub     0.0288        0.0730     0.1018     23.30
Topo      0.0465        0.0378     0.0843     19.29
write.csv(mite.rda.hp$Hier.part, 'results/Other_microbiome_analysis/mite.rda.hp.csv')

# CCA
# Chi-square-based unimodal ordination on the raw counts (no Hellinger
# transformation needed), constrained by all environmental variables.
mite.cca <- cca(mite~., mite.env)
summary(mite.cca)

Call:
cca(formula = mite ~ SubsDens + WatrCont + Substrate + Shrub +      Topo, data = mite.env) 

Partitioning of scaled Chi-square:
              Inertia Proportion
Total           1.696      1.000
Constrained     0.799      0.471
Unconstrained   0.897      0.529

Eigenvalues, and their contribution to the scaled Chi-square 

Importance of components:
                      CCA1  CCA2   CCA3   CCA4   CCA5   CCA6   CCA7    CCA8
Eigenvalue            0.44 0.132 0.0737 0.0453 0.0397 0.0231 0.0179 0.01059
Proportion Explained  0.26 0.078 0.0434 0.0267 0.0234 0.0136 0.0106 0.00625
Cumulative Proportion 0.26 0.338 0.3811 0.4078 0.4312 0.4448 0.4554 0.46165
                         CCA9   CCA10   CCA11    CA1    CA2    CA3    CA4
Eigenvalue            0.00827 0.00581 0.00209 0.1280 0.1185 0.0966 0.0786
Proportion Explained  0.00488 0.00342 0.00123 0.0755 0.0699 0.0569 0.0463
Cumulative Proportion 0.46653 0.46995 0.47119 0.5467 0.6165 0.6735 0.7198
                         CA5    CA6    CA7    CA8    CA9   CA10   CA11   CA12
Eigenvalue            0.0632 0.0546 0.0427 0.0345 0.0317 0.0290 0.0269 0.0230
Proportion Explained  0.0372 0.0322 0.0252 0.0203 0.0187 0.0171 0.0159 0.0135
Cumulative Proportion 0.7570 0.7892 0.8144 0.8347 0.8534 0.8706 0.8864 0.9000
                        CA13   CA14   CA15   CA16    CA17    CA18    CA19
Eigenvalue            0.0202 0.0187 0.0166 0.0156 0.01375 0.01259 0.01225
Proportion Explained  0.0119 0.0111 0.0098 0.0092 0.00811 0.00742 0.00722
Cumulative Proportion 0.9119 0.9229 0.9327 0.9419 0.95002 0.95744 0.96467
                         CA20    CA21    CA22    CA23    CA24    CA25    CA26
Eigenvalue            0.00964 0.00782 0.00726 0.00607 0.00539 0.00515 0.00418
Proportion Explained  0.00568 0.00461 0.00428 0.00358 0.00318 0.00304 0.00246
Cumulative Proportion 0.97035 0.97496 0.97924 0.98282 0.98600 0.98904 0.99150
                         CA27    CA28   CA29    CA30     CA31     CA32     CA33
Eigenvalue            0.00361 0.00284 0.0022 0.00192 0.001620 0.001012 0.000860
Proportion Explained  0.00213 0.00167 0.0013 0.00113 0.000955 0.000597 0.000507
Cumulative Proportion 0.99363 0.99530 0.9966 0.99773 0.998690 0.999287 0.999794
                          CA34
Eigenvalue            0.000349
Proportion Explained  0.000206
Cumulative Proportion 1.000000

Accumulated constrained eigenvalues
Importance of components:
                       CCA1  CCA2   CCA3   CCA4   CCA5   CCA6   CCA7   CCA8
Eigenvalue            0.440 0.132 0.0737 0.0453 0.0397 0.0231 0.0179 0.0106
Proportion Explained  0.551 0.166 0.0922 0.0567 0.0497 0.0289 0.0224 0.0133
Cumulative Proportion 0.551 0.717 0.8088 0.8655 0.9152 0.9441 0.9665 0.9798
                         CCA9   CCA10   CCA11
Eigenvalue            0.00827 0.00581 0.00209
Proportion Explained  0.01035 0.00727 0.00261
Cumulative Proportion 0.99012 0.99739 1.00000
# plot(mite.cca)
# Hierarchical segmentation
# Seed fixed for reproducibility: the CCA partitioning uses permutations
# (n.perm = 1000).
set.seed(123)
mite.cca.hp <- rdacca.hp(mite, mite.env, method = 'CCA', type = 'adjR2', scale = FALSE, n.perm = 1000)
mite.cca.hp
$Method_Type
[1] "CCA"   "adjR2"

$Total_explained_variation
[1] 0.338

$Hier.part
          Unique Average.share Individual I.perc(%)
SubsDens  0.0283        0.0061     0.0344      10.2
WatrCont  0.0871        0.0567     0.1438      42.5
Substrate 0.0423       -0.0015     0.0408      12.1
Shrub     0.0154        0.0571     0.0725      21.4
Topo      0.0257        0.0207     0.0464      13.7

attr(,"class")
[1] "rdaccahp"
# Bar plot of each variable's individual contribution
plot(mite.cca.hp)

# Save hierarchical segmentation results
mite.cca.hp$Hier.part
          Unique Average.share Individual I.perc(%)
SubsDens  0.0283        0.0061     0.0344      10.2
WatrCont  0.0871        0.0567     0.1438      42.5
Substrate 0.0423       -0.0015     0.0408      12.1
Shrub     0.0154        0.0571     0.0725      21.4
Topo      0.0257        0.0207     0.0464      13.7
write.csv(mite.cca.hp$Hier.part, 'results/Other_microbiome_analysis/mite.cca.hp.csv')

# db-RDA
# Distance-based RDA: ordinate a Bray-Curtis dissimilarity matrix under the
# same environmental constraints.
mite.bray <- vegdist(mite, method = 'bray')
mite.cap <- dbrda(mite.bray~., mite.env)
summary(mite.cap)

Call:
dbrda(formula = mite.bray ~ SubsDens + WatrCont + Substrate +      Shrub + Topo, data = mite.env) 

Partitioning of squared Bray distance:
              Inertia Proportion
Total           14.70      1.000
Constrained      7.42      0.505
Unconstrained    7.27      0.495

Eigenvalues, and their contribution to the squared Bray distance 

Importance of components:
                      dbRDA1 dbRDA2 dbRDA3 dbRDA4 dbRDA5 dbRDA6 dbRDA7  dbRDA8
Eigenvalue             4.603 0.9157 0.6740 0.4007 0.2560 0.2451 0.1516 0.10267
Proportion Explained   0.313 0.0623 0.0459 0.0273 0.0174 0.0167 0.0103 0.00699
Cumulative Proportion  0.313 0.3755 0.4214 0.4486 0.4660 0.4827 0.4930 0.50002
                      dbRDA9 dbRDA10  dbRDA11  MDS1   MDS2  MDS3   MDS4   MDS5
Eigenvalue            0.0573 0.01640 1.22e-03 1.977 0.9078 0.750 0.6545 0.5565
Proportion Explained  0.0039 0.00112 8.29e-05 0.135 0.0618 0.051 0.0445 0.0379
Cumulative Proportion 0.5039 0.50504 5.05e-01 0.640 0.7014 0.752 0.7970 0.8348
                        MDS6   MDS7   MDS8   MDS9  MDS10  MDS11  MDS12  MDS13
Eigenvalue            0.4746 0.4329 0.3899 0.3268 0.2802 0.2448 0.2248 0.1931
Proportion Explained  0.0323 0.0295 0.0265 0.0222 0.0191 0.0167 0.0153 0.0131
Cumulative Proportion 0.8671 0.8966 0.9231 0.9453 0.9644 0.9811 0.9964 1.0095
                       MDS14  MDS15  MDS16   MDS17  MDS18   MDS19  MDS20
Eigenvalue            0.1727 0.1570 0.1338 0.11045 0.1088 0.10508 0.0896
Proportion Explained  0.0118 0.0107 0.0091 0.00752 0.0074 0.00715 0.0061
Cumulative Proportion 1.0213 1.0319 1.0410 1.04856 1.0560 1.06311 1.0692
                        MDS21   MDS22   MDS23   MDS24   MDS25   MDS26   MDS27
Eigenvalue            0.08357 0.07386 0.05919 0.04980 0.04607 0.04064 0.03607
Proportion Explained  0.00569 0.00503 0.00403 0.00339 0.00313 0.00277 0.00245
Cumulative Proportion 1.07489 1.07992 1.08395 1.08734 1.09047 1.09324 1.09569
                        MDS28   MDS29   MDS30   MDS31    MDS32    MDS33   MDS34
Eigenvalue            0.02908 0.02638 0.01856 0.01629 0.010561 0.005755 0.00205
Proportion Explained  0.00198 0.00179 0.00126 0.00111 0.000719 0.000392 0.00014
Cumulative Proportion 1.09767 1.09946 1.10073 1.10184 1.102554 1.102945 1.10309
                          iMDS1    iMDS2     iMDS3    iMDS4    iMDS5    iMDS6
Eigenvalue            -0.003869 -0.00544 -0.011558 -0.01721 -0.02214 -0.02548
Proportion Explained  -0.000263 -0.00037 -0.000786 -0.00117 -0.00151 -0.00173
Cumulative Proportion  1.102822  1.10245  1.101665  1.10049  1.09899  1.09725
                         iMDS7    iMDS8    iMDS9   iMDS10   iMDS11   iMDS12
Eigenvalue            -0.02838 -0.03452 -0.04007 -0.04078 -0.04661 -0.05027
Proportion Explained  -0.00193 -0.00235 -0.00273 -0.00278 -0.00317 -0.00342
Cumulative Proportion  1.09532  1.09297  1.09025  1.08747  1.08430  1.08088
                        iMDS13   iMDS14  iMDS15   iMDS16   iMDS17   iMDS18
Eigenvalue            -0.05255 -0.05617 -0.0617 -0.06241 -0.06648 -0.07303
Proportion Explained  -0.00358 -0.00382 -0.0042 -0.00425 -0.00452 -0.00497
Cumulative Proportion  1.07730  1.07348  1.0693  1.06504  1.06051  1.05555
                        iMDS19   iMDS20   iMDS21   iMDS22   iMDS23 iMDS24
Eigenvalue            -0.08062 -0.08916 -0.10482 -0.11435 -0.13417 -0.293
Proportion Explained  -0.00549 -0.00607 -0.00713 -0.00778 -0.00913 -0.020
Cumulative Proportion  1.05006  1.04399  1.03686  1.02908  1.01995  1.000

Accumulated constrained eigenvalues
Importance of components:
                      dbRDA1 dbRDA2 dbRDA3 dbRDA4 dbRDA5 dbRDA6 dbRDA7 dbRDA8
Eigenvalue              4.60  0.916 0.6740  0.401 0.2560  0.245 0.1516 0.1027
Proportion Explained    0.62  0.123 0.0908  0.054 0.0345  0.033 0.0204 0.0138
Cumulative Proportion   0.62  0.743 0.8342  0.888 0.9226  0.956 0.9761 0.9899
                       dbRDA9 dbRDA10  dbRDA11
Eigenvalue            0.05735 0.01640 0.001218
Proportion Explained  0.00773 0.00221 0.000164
Cumulative Proportion 0.99763 0.99984 1.000000
# plot(mite.cap)
# Hierarchical segmentation
# Same partitioning as above, but on the distance matrix (method = 'dbRDA').
mite.cap.hp <- rdacca.hp(mite.bray, mite.env, method = 'dbRDA', type = 'adjR2', scale = FALSE)
mite.cap.hp
$Method_Type
[1] "dbRDA" "adjR2"

$Total_explained_variation
[1] 0.356

$Hier.part
          Unique Average.share Individual I.perc(%)
SubsDens  0.0296       -0.0055     0.0241      6.77
WatrCont  0.0749        0.0464     0.1213     34.07
Substrate 0.0474        0.0059     0.0533     14.97
Shrub     0.0315        0.0538     0.0853     23.96
Topo      0.0420        0.0305     0.0725     20.37

attr(,"class")
[1] "rdaccahp"
# Bar plot of each variable's individual contribution
plot(mite.cap.hp)

# Save hierarchical segmentation results
mite.cap.hp$Hier.part
          Unique Average.share Individual I.perc(%)
SubsDens  0.0296       -0.0055     0.0241      6.77
WatrCont  0.0749        0.0464     0.1213     34.07
Substrate 0.0474        0.0059     0.0533     14.97
Shrub     0.0315        0.0538     0.0853     23.96
Topo      0.0420        0.0305     0.0725     20.37
write.csv(mite.cap.hp$Hier.part, 'results/Other_microbiome_analysis/mite.cap.hp.csv')

# Significance test of variable contribution in hierarchical segmentation
# Taking the results of RDA as an example, the significance of the interpretability of each environmental factor is obtained based on 999 permutation tests.
set.seed(123)
permu_hp <- permu.hp(dv = mite.hel, iv = mite.env, method = 'RDA', type = 'adjR2', permutations = 999)

Please wait: running 998 permutations 
permu_hp
          Individual    Pr(>I)
SubsDens      0.0320 0.018   *
WatrCont      0.1642 0.001 ***
Substrate     0.0543 0.017   *
Shrub         0.1018 0.001 ***
Topo          0.0843 0.001 ***
# Save results
write.csv(permu_hp, 'results/Other_microbiome_analysis/permu_hp.csv')

# Plot
permu_hp$Variables <- rownames(permu_hp)
# Extract the significance stars from the 'Pr(>I)' column.
# NOTE(review): assumes each entry is "<p> <stars>" separated by a single
# space, so the second token is the star string -- confirm on real output,
# where multiple spaces would make token [2] empty.
permu_hp$p <- unlist(lapply(as.character(permu_hp$'Pr(>I)'), function(x) unlist(strsplit(x, ' '))[2]))

library(ggplot2)
# Bar chart of each variable's individual contribution, annotated with stars
ggplot(permu_hp, aes(Variables, Individual)) +
  geom_col() +
  geom_text(aes(label = p), vjust = -0.3)

Unconstrained cluster analysis based on DMM — searching for factors the community responds to (基于dmm进行非限制性聚类分析-寻找群落响应的因子)

# Load packages
library(BiocManager)
# BiocManager::install("microbiome")
# BiocManager::install("DirichletMultinomial")
# BiocManager::install("reshape2")
# BiocManager::install("magrittr",force = TRUE)
library(dplyr)
library(microbiome)
library(DirichletMultinomial)
library(reshape2)
library(magrittr)

# Load data
data("dietswap")
# Contains 52 microorganisms, 222 samples, and the sample information file has 8 columns
pseq<-dietswap
# Only core groups are selected for analysis
# Convert counts to relative abundances first (needed by core_members)
pseq.comp<-microbiome::transform(pseq,"compositional")
# The core group indicator: relative abundance of at least 0.1% in at least 50% of the samples
taxa<-core_members(pseq.comp,detection = 0.1/100,prevalence = 50/100)

# Keep only the core taxa in the count-scale phyloseq object
pseq<-prune_taxa(taxa,pseq) # Remove unwanted OTUs or groups from the phylogroup
# There are three analysis groups here, and we model according to this preset divided into three parts
map = sample_data(pseq)
head(map)
         subject    sex nationality group   sample timepoint
Sample-1     byn   male         AAM    DI Sample-1         4
Sample-2     nms   male         AFR    HE Sample-2         2
Sample-3     olt   male         AFR    HE Sample-3         2
Sample-4     pku female         AFR    HE Sample-4         2
Sample-5     qjy female         AFR    HE Sample-5         2
Sample-6     riv female         AFR    HE Sample-6         2
         timepoint.within.group  bmi_group
Sample-1                      1      obese
Sample-2                      1       lean
Sample-3                      1 overweight
Sample-4                      1      obese
Sample-5                      1 overweight
Sample-6                      1      obese
map$group %>% unique()
[1] DI HE ED
Levels: DI ED HE
# Abundance matrix of the core taxa: rows are taxa, columns are the 222 samples
dat<-microbiome::abundances(pseq)
# Transpose to samples x taxa and coerce to the count matrix that dmn() expects
count<-as.matrix(t(dat))
# Fit Dirichlet-multinomial mixture models with k = 1..4 components
fit<-lapply(1:4,dmn,count=count,verbose=TRUE)
  Soft kmeans
  Expectation Maximization setup
  Expectation Maximization
  Hessian
  Soft kmeans
    iteration 10 change 0.000080
  Expectation Maximization setup
  Expectation Maximization
    iteration 10 change 0.000000
  Hessian
  Soft kmeans
    iteration 10 change 0.000418
  Expectation Maximization setup
  Expectation Maximization
    iteration 10 change 0.000063
  Hessian
  Soft kmeans
    iteration 10 change 0.000259
    iteration 20 change 0.000004
  Expectation Maximization setup
  Expectation Maximization
    iteration 10 change 0.414533
    iteration 20 change 0.014936
  Hessian
# Judging the fitting effect
lplc<-sapply(fit,laplace) # Laplace approximation of model fit for each k
aic<-sapply(fit,AIC)
# When estimating parameters using the maximum likelihood method, the smaller the AIC and BIC, the better the fit.
bic<-sapply(fit,BIC)
# Select best model
# which.min picks the k whose Laplace approximation is smallest
best<-fit[[which.min(unlist(lplc))]]
# Get the parameter pi (mixing weight) and theta (dispersion) of the best fitting model
mixturewt(best)
     pi theta
1 0.324  80.2
2 0.265 119.5
3 0.234  71.4
4 0.177 338.0
# Save best model results
# Hard cluster assignment: the component with highest posterior probability per sample
ass<-apply(mixture(best),1,which.max)
write.csv(ass, file="results/Other_microbiome_analysis/DMM_3clusters_L6.csv")

# One diagnostic bar chart per community type (cluster): the taxa whose
# fitted component weights most strongly drive that cluster.
# Fix: melt(fitted(best)) does not depend on k, so it is hoisted out of the
# loop instead of being recomputed every iteration; seq() on a count is
# replaced by seq_len() (safe when ncol is 0).
d_all <- melt(fitted(best))
colnames(d_all) <- c("OTU", "cluster", "value")
for (k in seq_len(ncol(fitted(best)))) {
  d <- subset(d_all, cluster == k) %>%
    # Arrange OTUs by assignment strength
    arrange(value) %>%
    mutate(OTU = factor(OTU, levels = unique(OTU))) %>%
    # Keep only the strongest 20% of drivers
    filter(abs(value) > quantile(abs(value), 0.8))
  p <- ggplot(d, aes(x = OTU, y = value)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(title = paste("Top drivers: community type", k))
  print(p)
}

# `d` still holds the filtered drivers of the last cluster, printed below
d
                                    OTU cluster  value
1        Sporobacter termitidis et rel.       4   2.42
2      Subdoligranulum variable at rel.       4   2.57
3            Ruminococcus obeum et rel.       4   2.59
4        Butyrivibrio crossotus et rel.       4   2.77
5         Clostridium symbiosum et rel.       4   3.10
6         Clostridium cellulosi et rel.       4   3.29
7          Bacteroides vulgatus et rel.       4   4.17
8    Oscillospira guillermondii et rel.       4   9.03
9  Faecalibacterium prausnitzii et rel.       4   9.83
10            Prevotella oralis et rel.       4  32.80
11    Prevotella melaninogenica et rel.       4 224.42
# How to choose the best k value
# NOTE(review): data(fit) loads the example `fit` object shipped with the
# DirichletMultinomial package (278 samples x 130 taxa), OVERWRITING the
# `fit` list computed above -- confirm this demonstration is intentional.
data(fit)
lplc <- sapply(fit, laplace)
#plot(lplc, type="b")
fit[[which.min(lplc)]]
class: DMN 
k: 4 
samples x taxa: 278 x 130 
Laplace: 38781 BIC: 40425 AIC: 39477 
# Elbow-style plot of the Laplace approximation versus the number of clusters k
lplc2 <- as.data.frame(lplc)
lplc2$cluster <- rownames(lplc2)
p01_DMM <- ggplot(data = lplc2, aes(x=cluster,y=lplc, group = 1))+
  geom_point()+
  geom_line(color = "lightblue")+
  xlab("Number of clusters k")+  
  ylab("Laplace approximation")+  
  theme_classic() + 
  # Dashed line marks the chosen k (the minimum of the Laplace curve)
  geom_vline(xintercept = 4, colour='black', lwd=0.36, linetype="dashed")+
  theme(panel.grid.major=element_line(colour=NA),
        panel.background = element_rect(fill = "transparent",colour = NA),
        plot.background = element_rect(fill = "transparent",colour = NA),
        plot.title = element_text(hjust = 0.5,size = 15),
        panel.grid.minor = element_blank(),   
        text = element_text(family = "sans"),  
        axis.text.x = element_text(hjust = 0.5,size = 10), 
        axis.text.y = element_text(hjust = 0.5,size = 10),
        axis.title.y = element_text(size = 15), 
        axis.title.x = element_text(size = 15), 
        legend.text = element_text(size = 15),
        legend.position = c(.92,.72),  
        legend.box.background = element_rect(color="black"))#+ 
  #scale_x_continuous(limits = c(2000,2014),breaks = seq(2000,2014,1))  
#(p01_DMM)
p01_DMM

Microbial communities + Community environmental drivers - Chart combination(微生物群落和群落环境驱动因子组合图表)

# load data
design<-read.csv("data/test_design.csv",row.names = 1)
otu<-read.csv("data/test_otu.csv",row.names = 1)

# load packages
library(linkET)
library(ggplot2)
library(dplyr)
library(vegan)
library(reshape2)
library(RColorBrewer)
library(dplyr)  # NOTE(review): duplicate of the library(dplyr) above -- harmless but redundant

# Extract the community matrix (samples x first 200 OTUs) for the Mantel test
Bacteria<-as.data.frame(t(otu[1:200,1:24]))

# Extract environmental factors. This is a random demo setting with no
# practical significance; substitute your own environmental or biological
# factors here.
env_beta<-otu[840:863,7:19]
colnames(env_beta)<-c("SOC","TN","AP","TP","TK","AK","NO3","Lat","Lon","MAT",
                      "MAP","pH","Fe")

# Adjust data for visualization (demo data only)
env_beta[1:12,1:8]=env_beta[1:12,1:8]+10
env_beta[1:12,9:13]=env_beta[1:12,9:13]-8

# Mantel test of the community against each environmental factor; bin the
# r and p values into the categories used for link width and colour
mantel <- mantel_test(Bacteria, env_beta,spec_select = list(Bacteria=1:200)) %>% 
  mutate(rd = cut(r, breaks = c(-Inf, 0.2, 0.4, Inf),labels = c("< 0.2", "0.2 - 0.4", ">= 0.4")),
         pd = cut(p, breaks = c(-Inf, 0.01, 0.05, Inf),labels = c("< 0.01", "0.01 - 0.05", ">= 0.05")))

# Visualization: upper-triangle correlation matrix of the environmental
# factors via qcorrplot(correlate(env_beta)), with Mantel links drawn from
# the community to each factor
p1=qcorrplot(correlate(env_beta),type = "upper",diag = F)+
  geom_square()+ 
  geom_couple(aes(colour = pd,size = rd),data = mantel,curvature = nice_curvature())+
  scale_size_manual(values = c(1.5,3,4.5))+ 
  scale_fill_gradientn(colours = RColorBrewer::brewer.pal(11, "RdBu"))+
  scale_colour_manual(values = c("#8B30B5","#5AB328","gray88"))+ 
  guides(size = guide_legend(title = "Mantel's r",override.aes = list(colour = "grey35"),order = 2),
         colour = guide_legend(title = "Mantel's p", override.aes = list(size = 3),order = 1),
         fill = guide_colorbar(title = "Pearson's r", order = 3))


# Draw a heat map and select data for visualization
pp<-otu[1011:1024,1:10]
pp[pp>=8]=100
pp[pp>=7&pp<=8]=50

# When designing microbial and environmental factor data, it is assumed that it is a correlation coefficient. In fact, you need to calculate the correlation coefficient yourself (based on the significance results of correlation analysis Spearman and Pearson's r value)
rownames(pp)<-c(paste("Bacteria",1:14))
colnames(pp)<-c(paste("Soil",1:10))

# Convert to a long list required for ggplot drawing
io<-pp%>% mutate(pp=row.names(.))%>%melt()
df<-cbind(io,io$value)

# Add significance and mark it with an asterisk. You actually need to add it yourself (based on the p-value of Spearman and Pearson of the relevant analysis significance results). This is only for drawing, and the data is meaningless.
colnames(df)<-c("pp","variable","value","sig")

# Draw a heat map, add a square color system through geom_point, set the color gradient through scale_fill_gradientn, and set the border and color through geom_tile.
p2=ggplot(df,aes(variable,pp,fill=value))+
  geom_tile(color="gray42",fill="white",size=0.7)+
  geom_point(pch=22,color="white",size=10.8)+
  geom_vline(aes(xintercept =0.5), size=1.3,  colour="red")+
  geom_vline(aes(xintercept =10.5), size=1.3,  colour="red")+
  geom_hline(aes(yintercept =0.5), size=1.3,  colour="red")+
  geom_hline(aes(yintercept =14.5), size=1.3,  colour="red")+
  theme(panel.background = element_blank(),legend.position = "none")+
  theme(axis.text=element_text(colour='black',size=9))+
  scale_fill_gradientn(colours =colorRampPalette(c('#CCCCCC','#339966','#FF9900'))(99))+
  labs(x = NULL,y = NULL,fill="")

# Patchwork
library(cowplot)
p3 <- cowplot::plot_grid(p1, p2 ,ncol= 2, rel_widths = c(1.5, 1))
p3

ggsave("results/Other_microbiome_analysis/Chart_combination01.pdf", p3, width = 13, height = 6)

The relationships between environmental factors and high-abundance microorganisms(高丰度微生物和环境因子的关系探索)

#' Correlation heat maps between two variable sets with optional dendrograms
#'
#' Computes pairwise correlations (psych::corr.test, FDR adjusted) between
#' the columns of `env1` (plot rows, e.g. high-abundance taxa) and `env2`
#' (plot columns, e.g. environmental factors), and returns the same data
#' drawn two ways.
#'
#' @param env1 data.frame; samples in rows, variables in columns (plot rows).
#' @param env2 data.frame; samples in rows, variables in columns (plot
#'   columns). Rows are matched to `env1` by row name.
#' @param label logical; add the `env2` dendrogram on top of the plots
#'   (effective only when `row_cluster` is TRUE).
#' @param col_cluster logical; cluster the `env1` variables and attach the
#'   dendrogram on the left.
#' @param row_cluster logical; cluster the `env2` variables (top dendrogram).
#' @param method correlation method passed to psych::corr.test.
#' @param r.threshold,p.threshold correlations that are BOTH weaker than
#'   `r.threshold` AND less significant than `p.threshold` are zeroed.
#' @param theme.size axis text size.
#' @return list of two ggplot objects: a tile version and a bubble version.
cor_env_ggcorplot <- function(
  env1,
  env2,
  label = TRUE,
  col_cluster = TRUE,
  row_cluster = TRUE,
  method = "spearman",
  r.threshold = 0.6,
  p.threshold = 0.05,
  theme.size = 10
  ){
  # Align env2 to env1's sample order; a single-column env2 is left as-is
  # because `[match(...), ]` would drop it to a bare vector.
  if (dim(env2)[2] == 1) {
    env2 = env2
    } else {
    env2 <- env2[match(row.names(env1),row.names(env2)),]
    }
  env0 <- cbind(env1,env2)
  occor = psych::corr.test(env0,use="pairwise",method=method,adjust="fdr",alpha=.05)
  occor.r = occor$r
  occor.p = occor$p
  # NOTE(review): `&` zeroes only entries that are both non-significant AND
  # weak; if either condition alone should suppress a correlation, this
  # needs `|` -- confirm the intended filter.
  occor.r[occor.p > p.threshold&abs(occor.r) < r.threshold] = 0
  # Dendrogram over the env1 variables (left side of the heat map)
  if (col_cluster) {
    clust <- hclust(dist(env1 %>% as.matrix()%>% t()))
    ggtree_plot <- ggtree::ggtree(clust)
  }
  # Dendrogram over the env2 variables (top of the heat map)
  if (row_cluster) {
    v_clust <- hclust(dist(env2 %>% as.matrix() %>% t()))
    ggtree_plot_col <- ggtree::ggtree(v_clust) + ggtree::layout_dendrogram()
  }
  # Long-format r values restricted to env1-by-env2 pairs
  occor.r = as.data.frame(occor.r)
  if (dim(env2)[2] == 1) {
    data <- occor.r[colnames(env1),colnames(env2)]
    data = data.frame(row.names = colnames(env1),data)
    colnames(data) = colnames(env2)
    data$id = row.names(data)
  } else {
    data <- occor.r[colnames(env1),colnames(env2)]
    data$id = row.names(data)
  }
  pcm = reshape2::melt(data, id = c("id"))
  # Long-format p values with the same layout
  occor.p = as.data.frame(occor.p)
  if (dim(env2)[2] == 1) {
    data <- occor.p[colnames(env1),colnames(env2)]
    data = data.frame(row.names = colnames(env1),data)
    colnames(data) = colnames(env2)
    data$id = row.names(data)
  } else {
    data <- occor.p[colnames(env1),colnames(env2)]
    data$id = row.names(data)
  }
  pcm2 = reshape2::melt(data, id = c("id"))
  colnames(pcm2)[3] = "p"
  # Significance stars, derived from the NUMERIC p column.
  # Bug fix: the previous version copied p into `lab`, assigned "**" and
  # then compared `lab` again, which coerced the column to character so the
  # later `< 0.05` / `>= 0.05` comparisons ran lexicographically (e.g.
  # "1e-05" >= "0.05" is TRUE), mislabelling small p values and demoting
  # every "**" to "*". Assign "*" first so "**" survives for p < 0.001.
  pcm2$lab = ""
  pcm2$lab[pcm2$p < 0.05] = "*"
  pcm2$lab[pcm2$p < 0.001] = "**"
  pcm3 = pcm %>% left_join(pcm2)
  # Tile rendering of the r matrix, stars overlaid
  p1 = ggplot(pcm3, aes(y = id, x = variable)) + 
    geom_tile(aes(size = value,fill = value))+
    scale_size_continuous(limits = c(0.000001, 100), range = c(2,25), breaks = c(0.1,0.5,1)) + 
    geom_text(aes(label = lab)) +
    labs( y= "", x = "", size = "Relative Abundance (%)", fill = "")  + 
    scale_x_discrete(limits = rev(levels(pcm$variable)))  + 
    scale_y_discrete(position = "right") +
    scale_fill_gradientn(colours =colorRampPalette(c("#377EB8","#F7F4F9","#E41A1C"))(60)) +
    theme(
      panel.background=element_blank(),
      panel.grid=element_blank(),
      axis.text.x = element_text(colour = "black",size = theme.size,angle = 60,vjust = 1,hjust = 1)
    )
  # Bubble rendering of the same data
  p2 = ggplot(pcm3, aes(y = id, x = variable)) + 
    geom_point(aes(size = value,fill = value), alpha = 0.75, shape = 21) + 
    scale_size_continuous(limits = c(0.000001, 100), range = c(2,25), breaks = c(0.1,0.5,1)) + 
    geom_text(aes(label = lab)) +
    labs( y= "", x = "", size = "Relative Abundance (%)", fill = "")  + 
    scale_x_discrete(limits = rev(levels(pcm$variable)))  + 
    scale_y_discrete(position = "right")  +
    scale_fill_gradientn(colours =colorRampPalette(c("#377EB8","#F7F4F9","#E41A1C"))(60))  +
    theme(
      panel.background=element_blank(),
      panel.grid=element_blank(),
      axis.text.x = element_text(colour = "black",size = theme.size,angle = 60,vjust = 1,hjust = 1)
    )
  if (col_cluster) {
    p1 <- p1  %>%
      aplot::insert_left(ggtree_plot, width=.2) 
    p2 <- p2  %>%
      aplot::insert_left(ggtree_plot, width=.2) 
  }
  # Bug fix: gate on row_cluster too -- ggtree_plot_col only exists when the
  # top dendrogram was built, so label = TRUE with row_cluster = FALSE used
  # to fail with an "object not found" error.
  if (label && row_cluster) {
    p1 <- p1  %>%
      aplot::insert_top(ggtree_plot_col, height=.1)
    p2 <- p2  %>%
      aplot::insert_top(ggtree_plot_col, height=.1)
  }
  return(list(p1,p2))
}

# Load data
metadata = read.delim("./data/data.ps/metadata.tsv")
row.names(metadata) = metadata$SampleID
otutab = read.table("./data/data.ps/otutab.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
taxonomy = read.table("./data/data.ps/taxonomy.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)

otutab2 = apply(otutab, 2, function(x) x/sum(x))
otutab2 <- as.data.frame(otutab2)
otutab2$OTU <- rownames(otutab2)

taxonomy <- as.data.frame(taxonomy)
taxonomy$OTU <- rownames(taxonomy)

otutab3 = merge(taxonomy, otutab2, by = "OTU")
rownames(otutab3) <- otutab3$OTU

otutab_phylum <- otutab3[, c(3, 9:26)]

# sum of phylum
otutab_phylum <- aggregate(.~ Phylum, data = otutab_phylum, sum)
rownames(otutab_phylum) = otutab_phylum$Phylum
otutab_phylum = otutab_phylum[, -1]

#otu = phyloseq::otu_table(psdata)
otu = otutab_phylum 
#tax = phyloseq::tax_table(psdata)

Top = 10  
if (dim(otu)[1] < Top) {
    top10 <- otu[names(sort(rowSums(otu), decreasing = TRUE)[1:dim(otu)[1]]),]
    top10 = t(top10)
  } else {
    top10 <- otu[names(sort(rowSums(otu), decreasing = TRUE)[1:Top]),]
    top10 = t(top10)
  }
head(top10)
    Proteobacteria Actinobacteria Bacteroidetes Firmicutes Chloroflexi
KO1          0.659          0.255        0.0294    0.01634     0.01640
KO2          0.500          0.405        0.0262    0.02002     0.02142
KO3          0.608          0.281        0.0746    0.01508     0.00623
KO4          0.621          0.282        0.0305    0.03232     0.01819
KO5          0.738          0.172        0.0509    0.00733     0.01439
KO6          0.694          0.236        0.0233    0.01572     0.01108
    Unassigned Acidobacteria Verrucomicrobia Planctomycetes Spirochaetes
KO1    0.01539       0.00369        0.002012       0.000671     0.000701
KO2    0.01678       0.00350        0.002070       0.002461     0.000447
KO3    0.00659       0.00244        0.001480       0.001739     0.001168
KO4    0.01079       0.00191        0.001008       0.000398     0.000928
KO5    0.01103       0.00179        0.000984       0.000725     0.001450
KO6    0.01110       0.00366        0.001942       0.000902     0.000684
# Environmental-factor table: column ID holds the sample identifier,
# remaining columns are soil/plant measurements
env = read.csv("./data/dataNEW/env.csv")
head(env)
       ID   pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR
1 sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863
2 sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976
3 sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340
4 sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801
5 sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665
6 sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145
    LB  SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
    LRD2  LRD3 MaxO TLRN TLRL   BI   MID
1 0.0655 0.058    4  156  296 1.54 0.619
2 0.0700 0.069    5  133  263 1.80 1.780
3 0.0660 0.066    6  177  316 2.67 1.373
4 0.0940 0.086    5  139  335 1.37 0.811
5 0.0680 0.071    7  182  238 1.88 0.700
6 0.0700 0.068    4  198  457 1.73 0.602
# Working copy of env; envRDA is re-indexed by sample ID below
envRDA = env
head(env)
       ID   pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR
1 sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863
2 sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976
3 sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340
4 sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801
5 sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665
6 sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145
    LB  SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
    LRD2  LRD3 MaxO TLRN TLRL   BI   MID
1 0.0655 0.058    4  156  296 1.54 0.619
2 0.0700 0.069    5  133  263 1.80 1.780
3 0.0660 0.066    6  177  316 2.67 1.373
4 0.0940 0.086    5  139  335 1.37 0.811
5 0.0680 0.071    7  182  238 1.88 0.700
6 0.0700 0.068    4  198  457 1.73 0.602
# Move sample IDs into the row names and drop the ID column
row.names(envRDA) = env$ID
envRDA$ID = NULL
head(envRDA)
          pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR   LB
sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863 48.4
sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976 70.1
sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340 61.1
sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801 74.7
sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665 58.2
sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145 70.4
         SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
sample1 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
sample2 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
sample3 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
sample4 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
sample5 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
sample6 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
          LRD2  LRD3 MaxO TLRN TLRL   BI   MID
sample1 0.0655 0.058    4  156  296 1.54 0.619
sample2 0.0700 0.069    5  133  263 1.80 1.780
sample3 0.0660 0.066    6  177  316 2.67 1.373
sample4 0.0940 0.086    5  139  335 1.37 0.811
sample5 0.0680 0.071    7  182  238 1.88 0.700
sample6 0.0700 0.068    4  198  457 1.73 0.602
# env1: environmental matrix; env2: top-phylum abundances per sample.
# NOTE(review): assigning rownames(env1) as env2$sample assumes envRDA and
# top10 hold the same samples in the same order — confirm. Column 11 is the
# just-added 'sample' column (assumes top10 has exactly 10 taxa columns).
env1 = envRDA
env2 = top10
env2 = as.data.frame(env2)
env2$sample <- rownames(env1)
rownames(env2) <- env2$sample
env2 <- env2[, -11]

# Spearman correlations between environmental factors (env1) and the
# top-phylum abundances (env2); returns a heatmap (p1) and bubble plot (p2)
result <- cor_env_ggcorplot(
  env1 = env1,
  env2 = env2,
  label = TRUE,
  col_cluster = TRUE,
  row_cluster = TRUE,
  method = "spearman",
  r.threshold = 0,
  p.threshold = 0
)

p1 <- result[[1]]
p1
p2 <- result[[2]]
p2

# Figure dimensions scale with taxon / env-variable counts
hei <- dim(env)[2] / 5
wid <- Top

# Export the abundance table used above
write.csv(top10, "results/Other_microbiome_analysis/Top10_Phylum_abundacne_OTU.csv")

# Export heatmap and bubble plot as PDF and JPG
ggsave("results/Other_microbiome_analysis/ggheatmap.pdf", p1, width = Top/2, height = dim(env)[2]/5)
ggsave("results/Other_microbiome_analysis/ggbubble.pdf", p2, width = Top/2, height = dim(env)[2]/5)
ggsave("results/Other_microbiome_analysis/ggheatmap2.jpg", p1, width = Top/2, height = dim(env)[2]/5)
ggsave("results/Other_microbiome_analysis/ggbubble2.jpg", p2, width = Top/2, height = dim(env)[2]/5)

Exploring the relationship between highly variable microorganisms and environmental factors(变化幅度大的微生物和环境因子的关系探索)

# Reload the raw tables (same inputs as the previous section)
metadata = read.delim("./data/data.ps/metadata.tsv")
row.names(metadata) = metadata$SampleID
otutab = read.table("./data/data.ps/otutab.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
taxonomy = read.table("./data/data.ps/taxonomy.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)

# Per-sample relative abundances
otutab2 = apply(otutab, 2, function(x) x/sum(x))
otutab2 <- as.data.frame(otutab2)
otutab2$OTU <- rownames(otutab2)

taxonomy <- as.data.frame(taxonomy)
taxonomy$OTU <- rownames(taxonomy)

# Join taxonomy onto abundances by OTU id
otutab3 = merge(taxonomy, otutab2, by = "OTU")
rownames(otutab3) <- otutab3$OTU

# Column 3 = Phylum; columns 9:26 = the 18 sample columns (dataset-specific)
otutab_phylum <- otutab3[, c(3, 9:26)]

# Sum relative abundances within each phylum
otutab_phylum <- aggregate(.~ Phylum, data = otutab_phylum, sum)
rownames(otutab_phylum) = otutab_phylum$Phylum
otutab_phylum = otutab_phylum[, -1]

#otu = phyloseq::otu_table(psdata)
otu = otutab_phylum 
#tax = phyloseq::tax_table(psdata)

# NOTE(review): tran appears unused in this section — confirm it can be removed
tran = TRUE
# Number of most-variable phyla to keep below
Top = 10

# Row-wise standard deviation of a matrix-like object.
# Returns a numeric vector, one value per row (names preserved).
rowSD <- function(x) {
  apply(x, MARGIN = 1, FUN = stats::sd)
}
  
# Row-wise coefficient of variation (sd / mean) of a matrix-like object.
# Inlines the row-sd computation rather than delegating to rowSD().
rowCV <- function(x) {
  sds <- apply(x, 1, stats::sd)
  sds / rowMeans(x)
}

# Rank phyla by coefficient of variation across samples and keep the
# Top most variable ones
id <- otu %>%
    as.data.frame() %>% 
    rowCV %>%
    sort(decreasing = TRUE) %>%
    head(Top) %>%
    names()

# Samples x selected-phyla abundance table
data = otu[id,] %>% t() %>%
    as.data.frame()

# Environmental-factor table (re-read; ID column = sample identifier)
env = read.csv("./data/dataNEW/env.csv")
head(env)
       ID   pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR
1 sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863
2 sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976
3 sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340
4 sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801
5 sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665
6 sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145
    LB  SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
    LRD2  LRD3 MaxO TLRN TLRL   BI   MID
1 0.0655 0.058    4  156  296 1.54 0.619
2 0.0700 0.069    5  133  263 1.80 1.780
3 0.0660 0.066    6  177  316 2.67 1.373
4 0.0940 0.086    5  139  335 1.37 0.811
5 0.0680 0.071    7  182  238 1.88 0.700
6 0.0700 0.068    4  198  457 1.73 0.602
# Working copy of env; envRDA is re-indexed by sample ID below
envRDA = env
head(env)
       ID   pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR
1 sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863
2 sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976
3 sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340
4 sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801
5 sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665
6 sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145
    LB  SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
    LRD2  LRD3 MaxO TLRN TLRL   BI   MID
1 0.0655 0.058    4  156  296 1.54 0.619
2 0.0700 0.069    5  133  263 1.80 1.780
3 0.0660 0.066    6  177  316 2.67 1.373
4 0.0940 0.086    5  139  335 1.37 0.811
5 0.0680 0.071    7  182  238 1.88 0.700
6 0.0700 0.068    4  198  457 1.73 0.602
# Move sample IDs into the row names and drop the ID column
row.names(envRDA) = env$ID
envRDA$ID = NULL
head(envRDA)
          pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR   LB
sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863 48.4
sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976 70.1
sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340 61.1
sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801 74.7
sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665 58.2
sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145 70.4
         SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
sample1 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
sample2 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
sample3 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
sample4 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
sample5 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
sample6 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
          LRD2  LRD3 MaxO TLRN TLRL   BI   MID
sample1 0.0655 0.058    4  156  296 1.54 0.619
sample2 0.0700 0.069    5  133  263 1.80 1.780
sample3 0.0660 0.066    6  177  316 2.67 1.373
sample4 0.0940 0.086    5  139  335 1.37 0.811
sample5 0.0680 0.071    7  182  238 1.88 0.700
sample6 0.0700 0.068    4  198  457 1.73 0.602
# env1: environmental matrix; env2: CV-selected phylum abundances.
# NOTE(review): assumes identical sample order between envRDA and data —
# confirm. Column 11 is the just-added 'sample' column (assumes data has
# exactly 10 taxa columns).
env1 = envRDA
env2 = data
env2 = as.data.frame(env2)
env2$sample <- rownames(env1)
rownames(env2) <- env2$sample
env2 <- env2[, -11]

# Spearman correlations between environmental factors and the most
# variable phyla; heatmap (p1) and bubble plot (p2)
result <- cor_env_ggcorplot(
  env1 = env1,
  env2 = env2,
  label = TRUE,
  col_cluster = TRUE,
  row_cluster = TRUE,
  method = "spearman",
  r.threshold = 0,
  p.threshold = 0
)

p1 <- result[[1]]
p1
p2 <- result[[2]]
p2

# Figure dimensions scale with taxon / env-variable counts
hei <- dim(env)[2] / 5
wid <- Top

# Export the abundance table used above
write.csv(data, "results/Other_microbiome_analysis/Top10_Phylum_abundacne_OTU2.csv")

# Export heatmap and bubble plot as PDF and JPG
ggsave("results/Other_microbiome_analysis/ggheatmap3.pdf", p1, width = Top/2, height = dim(env)[2]/5)
ggsave("results/Other_microbiome_analysis/ggbubble3.pdf", p2, width = Top/2, height = dim(env)[2]/5)
ggsave("results/Other_microbiome_analysis/ggheatmap4.jpg", p1, width = Top/2, height = dim(env)[2]/5)
ggsave("results/Other_microbiome_analysis/ggbubble4.jpg", p2, width = Top/2, height = dim(env)[2]/5)

Analysis of the relationship between network modules and environmental factors(基于网络模块和环境因子的关系分析)

library(phyloseq)
library(ggClusterNet)
library(tidyverse)

# Prefer ggClusterNet's cor() over stats::cor in the calls below
# (presumably conflicted::conflicts_prefer — confirm the conflicted
# package is attached)
conflicts_prefer(ggClusterNet::cor)

# Environmental-factor table (re-read; ID column = sample identifier)
env = read.csv("./data/dataNEW/env.csv")
head(env)
       ID   pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR
1 sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863
2 sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976
3 sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340
4 sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801
5 sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665
6 sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145
    LB  SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
    LRD2  LRD3 MaxO TLRN TLRL   BI   MID
1 0.0655 0.058    4  156  296 1.54 0.619
2 0.0700 0.069    5  133  263 1.80 1.780
3 0.0660 0.066    6  177  316 2.67 1.373
4 0.0940 0.086    5  139  335 1.37 0.811
5 0.0680 0.071    7  182  238 1.88 0.700
6 0.0700 0.068    4  198  457 1.73 0.602
# Working copy of env; envRDA is re-indexed by sample ID below
envRDA = env
head(env)
       ID   pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR
1 sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863
2 sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976
3 sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340
4 sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801
5 sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665
6 sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145
    LB  SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
    LRD2  LRD3 MaxO TLRN TLRL   BI   MID
1 0.0655 0.058    4  156  296 1.54 0.619
2 0.0700 0.069    5  133  263 1.80 1.780
3 0.0660 0.066    6  177  316 2.67 1.373
4 0.0940 0.086    5  139  335 1.37 0.811
5 0.0680 0.071    7  182  238 1.88 0.700
6 0.0700 0.068    4  198  457 1.73 0.602
# Move sample IDs into the row names and drop the ID column
row.names(envRDA) = env$ID
envRDA$ID = NULL
head(envRDA)
          pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR   LB
sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863 48.4
sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976 70.1
sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340 61.1
sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801 74.7
sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665 58.2
sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145 70.4
         SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
sample1 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
sample2 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
sample3 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
sample4 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
sample5 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
sample6 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
          LRD2  LRD3 MaxO TLRN TLRL   BI   MID
sample1 0.0655 0.058    4  156  296 1.54 0.619
sample2 0.0700 0.069    5  133  263 1.80 1.780
sample3 0.0660 0.066    6  177  316 2.67 1.373
sample4 0.0940 0.086    5  139  335 1.37 0.811
sample5 0.0680 0.071    7  182  238 1.88 0.700
sample6 0.0700 0.068    4  198  457 1.73 0.602
# 16S phyloseq object; the Group column drives the per-group loop below
ps = readRDS("./data/dataNEW/ps_16s.rds")

id = sample_data(ps)$Group %>% unique()
id
[1] "Group1" "Group2" "Group3"
# Leftover from interactive debugging of the loop below
i = 1

res1path <- "results/Other_microbiome_analysis/"

# For each sample group: build a co-occurrence network, derive module
# eigengenes, rank modules by random-forest importance against one env
# variable, draw module-env correlation heatmaps, and relate hub taxa to
# the environmental factors. Writes one output directory per group.
for (i in seq_along(id)) {
  netpath = paste(res1path,"network_env_hub_",id[i],"/",sep = "")
  dir.create(netpath)
  # ps.1 = phyloseq::subset_samples(
  #   ps,Group %in% c(id[i])
  # )
  # TMM-normalise, then keep only samples of the current group
  ps.1 = ps %>% scale_micro("TMM") %>%
    subset_samples(
      Group %in% c(id[i])
    )
  library(ggClusterNet)
  library(igraph)
  #-- Compute the microbial network correlation matrix
  result= ggClusterNet::cor_Big_micro(ps = ps.1,
                                      N = 500,
                                      p.threshold = 0.05,
                                      r.threshold = 0.8,
                                      scale = FALSE)
  cor = result[[1]]
  #-- Fit network modules
  tem <- model_maptree(cor =result[[1]],
                       method = "cluster_fast_greedy",
                       seed = 12
  )
  node_model = tem[[2]]
  head(node_model)
  otu = ps.1 %>%
    phyloseq::subset_taxa(
      row.names(tax_table(ps ))%in%c(row.names(result[[1]]))) %>%
    vegan_otu() %>%
    as.data.frame()
  #-- Align the node/module table with the OTU column order
  node_model = node_model[match(colnames(otu),node_model$ID),]
  MEList = WGCNA::moduleEigengenes(otu, colors = node_model$group)
  MEs = MEList$eigengenes
  tablename <- paste(netpath,"./model_network_feature_value_",id[i],".csv",sep = "")
  write.csv(MEs,tablename)
  #-- Find the module with the greatest effect on a given environmental
  #-- factor: random-forest permutation importance of module eigengenes
  #-- against one env variable (the 2nd column of `sink`).
 ramdom_Model_env.plot <- function(
    model = model,
    sink = env,
    seed = 1
){
  model$ID = row.names(model)
  set.seed(seed)
  tem.r = sink %>% inner_join(model,by = "ID") %>%
    select(-ID)
  frichness.rfP <- rfPermute::rfPermute(tem.r[[1]] ~., data=tem.r[,-1],
                                        ntree = 500, na.action = na.omit, nrep = 100, num.cores = 1)
  frimp.scaled1 <- rfPermute::importance(frichness.rfP, scale = TRUE)%>% round(3)
  frimp.scaled1 <- frimp.scaled1[,1:2]
  df<-cbind(as.data.frame(frimp.scaled1),rownames(frimp.scaled1))
  head(df)
  df$`%IncMSE.pval`[is.na(df$`%IncMSE.pval`)] = 1
  # Star-flag modules whose permutation p-value is significant (p <= 0.05)
  a =  c()
  for (i in seq_along(df$`%IncMSE.pval`)) {
    if (df$`%IncMSE.pval`[i] > 0.05) {
      a[i] = ""
    } else {
      a[i] = "*"
    }
  }
  df$lab = a
  df$`%IncMSE`[is.na(df$`%IncMSE`)] = 0
  p <- ggplot(df, aes(x =`%IncMSE` , y =reorder(`rownames(frimp.scaled1)`,`%IncMSE`) )) + 
    geom_bar(stat = "identity", width = 0.75,position = "dodge",colour="black",fill="#9ACD32",alpha=1) + 
    geom_text(aes(label = lab),hjust = -1) +
    labs(y="Model in network", x="%IncMSE", title = colnames(sink)[2],size=9)+
    theme_bw() +
    theme(axis.text=element_text(colour='black',size=9))
  return(list(df,p))
}
  env.1 = env %>% filter(ID %in% sample_names(ps.1))
  result <- ramdom_Model_env.plot(model = MEs,
                                  sink =env.1[,1:2] )
  p <- result[[2]]
  p
  data = result[[1]]
  head(data)
  hit <- dim(MEs)[2]
  hit
  FileName <- paste(netpath,"./ranImportant_Model_order_", id[i],".pdf", sep = "")
  ggsave(FileName, p,width = 6,height =hit/5)
  FileName <- paste(netpath,"./ranImportant_Model_order", id[i], ".csv", sep = "")
  write.csv(data,FileName)
  nGenes = ncol(otu)
  nSamples = nrow(otu)
  # Module eigengene vs. environment correlations (WGCNA-style heatmap)
  moduleTraitCor = cor(MEs, envRDA[sample_names(ps.1),], use = "p")
  moduleTraitPvalue = WGCNA::corPvalueStudent(moduleTraitCor, nSamples)
  pdf(file=paste(netpath,"./","Module-env_relationships.pdf",sep = ""),width=dim(envRDA)[2]/2,height=dim(MEs)[2]/2)
  # Display correlations and their p-values in each heatmap cell
  textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                     signif(moduleTraitPvalue, 1), ")", sep = "")
  
  dim(textMatrix) = dim(moduleTraitCor)
  par(mar = c(6, 8.5, 3, 3))
  WGCNA::labeledHeatmap(Matrix = moduleTraitCor,
                        xLabels = names(envRDA),
                        yLabels = names(MEs),
                        ySymbols = names(MEs),
                        colorLabels = FALSE,
                        colors = WGCNA::greenWhiteRed(50),
                        textMatrix = textMatrix,
                        setStdMargins = FALSE,
                        cex.text = 0.5,
                        zlim = c(-1,1),
                        main = paste("Module-trait relationships"))
  dev.off()
  #-- Key (hub) microorganisms vs. physicochemical factors
  igraph = make_igraph(cor)
  tem = 10
  hub = hub_score(igraph)$vector %>%
    sort(decreasing = TRUE) %>%
    head(tem) %>%
    as.data.frame()
  colnames(hub) = "hub_sca"
  p = ggplot(hub) +
    geom_bar(aes(x = hub_sca,y = reorder(row.names(hub),hub_sca)),stat = "identity",fill = "#4DAF4A")
  p
  FileName <- paste(netpath,"./hub_micro", ".pdf", sep = "")
  ggsave(FileName, p,width = 6,height =tem/2)
  FileName <- paste(netpath,"./hub_micro", ".csv", sep = "")
  write.csv(hub,FileName)
  id.2 = row.names(hub)
  otu = phyloseq::otu_table(ps.1)
  tax = phyloseq::tax_table(ps.1)
  head(otu)
  data = otu[id.2,] %>% t() %>%
    as.data.frame()
  result = cor_env_ggcorplot(
    env1 = envRDA[sample_names(ps.1),],
    env2 = data,
    label =  FALSE,
    col_cluster = FALSE,
    row_cluster = FALSE,
    method = "spearman",
    r.threshold= 0.5,
    p.threshold= 0
  )
  p1 <- result[[1]] 
  p1
  p2 <- result[[2]]
  p2
  hei = dim(env)[2]/5
  # BUG FIX: p1 and p2 previously shared the same file names
  # ("hum_env.pdf"/"hum_env.jpg"), so the bubble plot overwrote the
  # heatmap; p2 now gets distinct "_bubble" file names.
  filename = paste(netpath,"hum_env.pdf",sep = "")
  ggsave(filename,p1,width = tem/2,height = dim(env)[2]/5)
  filename = paste(netpath,"hum_env_bubble.pdf",sep = "")
  ggsave(filename,p2,width = tem/2,height = dim(env)[2]/5)
  
  filename = paste(netpath,"hum_env.jpg",sep = "")
  ggsave(filename,p1,width = tem/2,height = dim(env)[2]/5)
  filename = paste(netpath,"hum_env_bubble.jpg",sep = "")
  ggsave(filename,p2,width = tem/2,height = dim(env)[2]/5)
  
}

Network analysis of bacterial and environmental factors(细菌和环境因子网络分析)

res1path <- "results/Other_microbiome_analysis/"
Envnetplot<- paste(res1path,"./Env_network",sep = "")
dir.create(Envnetplot)

# 16S phyloseq object, relative-abundance scaled; no ITS data in this run
ps16s = readRDS("./data/dataNEW/ps_16s.rds")%>% ggClusterNet::scale_micro()
psITS = NULL

library(phyloseq)
conflicted::conflicts_prefer(ggplot2::theme_void)

#-- The sample maps in the bacterial and fungal phyloseq objects must match
ps.merge <- ggClusterNet::merge16S_ITS(ps16s = ps16s,
                                       psITS= psITS,
                         NITS = 200,
                         N16s = 200)

map =  phyloseq::sample_data(ps.merge)
# head(map)
# Collapse all samples into a single group for one combined network
map$Group = "one"
phyloseq::sample_data(ps.merge) <- map

#-- Environmental factors: Hellinger-standardise, keeping the ID column
data1 = env
envRDA.s = vegan::decostand(envRDA,"hellinger")
data1[,-1] = envRDA.s
# Env variables form their own node group in the bipartite network
Gru = data.frame(ID = colnames(env)[-1],group = "env" )
head(Gru)
     ID group
1    pH   env
2   SOC   env
3    TN   env
4 NH4.N   env
5 NO3.N   env
6    AP   env
library(sna)
library(ggClusterNet)
library(igraph)
# Bipartite microbe-environment co-occurrence network; writes figures
# into Envnetplot and returns the plot (result[[1]]) and the network
# parameter table (result[[2]])
result <- ggClusterNet::corBionetwork(ps = ps.merge,
                        N = 0,
                        r.threshold = 0.6, 
                        p.threshold = 0.05,
                        big = TRUE,
                        group = "Group",
                        env = data1, 
                        envGroup = Gru,
                        layout_net = "model_maptree2",
                        path = Envnetplot,
                        fill = "Phylum", 
                        size = "igraph.degree", 
                        scale = TRUE, 
                        bio = TRUE, 
                        zipi = FALSE,
                        step = 100,
                        width = 18,
                        label = TRUE,
                        height = 10
)
[1] "one"
 num [1:34, 1:18] 0.0474 0.0633 0.0188 0.0588 0.069 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:34] "pH" "SOC" "TN" "NH4.N" ...
  ..$ : chr [1:18] "sample1" "sample10" "sample11" "sample12" ...
[1] "1"
[1] "2"
[1] "3"
# Extract the combined network plot and its parameter table
p <- result[[1]]
p

# Comparison of all sample network parameters
data <- result[[2]]

# Save the network figure in several formats and export the edge table
ggsave(paste0(Envnetplot, "/network_all.jpg"), p, width = 15, height = 12, dpi = 72)
ggsave(paste0(Envnetplot, "/network_all.png"), p, width = 10, height = 8, dpi = 72)
ggsave(paste0(Envnetplot, "/network_all.pdf"), p, width = 15, height = 12)
write.csv(data, paste0(Envnetplot, "/co-occurrence_Grobel_net", ".csv"))

Correlation analysis of network attributes and modules with environmental factors(网络属性-模块与环境因子相关分析)

# Correlate (a) per-module abundance z-scores and (b) per-sample network
# properties with one environmental variable, separately for each group.
#
# Args:
#   ps           phyloseq object (OTU table + sample data with Group column)
#   corpath      output directory for the correlation scatter PDFs
#   Top          number of most-abundant OTUs kept per group network
#   r.threshold, p.threshold  cutoffs for the co-occurrence correlations
#   env          environmental table; first column holds the sample IDs
#   select.mod   module names to keep, or a single integer = "top n modules"
#   select.env   name of the env column to correlate against
# Side effects: writes two PDF files per group under `corpath`.
net.property_env = function(
    ps = ps,
    corpath = corpath,
    Top = 500,
    r.threshold= 0.8,
    p.threshold=0.05,
    env = env,
    select.mod = c("model_1","model_2","model_3"),
    select.env = "pH"
  ){
  map = sample_data(ps)
  id3 = map$Group %>% unique()
  for (m in seq_along(id3)) {
    #--- Build the microbial network over all modules for this group #------
    pst = ps %>%
      filter_taxa(function(x) sum(x ) > 0, TRUE) %>%
      scale_micro("rela") %>%
     subset_samples.wt("Group",id3[m]) %>%
      filter_OTU_ps(Top)
    
    result = cor_Big_micro(ps = pst,
                           N = 0,
                           r.threshold= r.threshold,
                           p.threshold= p.threshold,
                           method = "spearman")
    cor = result[[1]]
    head(cor)
    #-- Compute module assignments; OTUs without a module are filtered below
    result2 = model_maptree2(cor = cor, method = "cluster_fast_greedy")
    #---- Relate env factors to modules via per-module summed z-scores #-----
    mod1 = result2[[2]]
    head(mod1)
    tem = mod1$group %>% table() %>% 
      as.data.frame() %>% 
      dplyr::arrange(desc(Freq))
    colnames(tem) = c("Model","OTU.num")
    head(tem)
    # select.mod may be a single integer (keep the n largest modules) or a
    # character vector of module names
    if (length(select.mod) == 1 && is.numeric(select.mod)) {
      select.mod.name = tem$Model[1:select.mod]
      mod1 = mod1 %>% filter(!group == "mother_no",
                             group %in%c(select.mod.name)
      ) %>% select(ID,group,degree) 
    } else if (is.character(select.mod)) {
      select.mod.name = select.mod
      mod1 = mod1 %>% filter(!group == "mother_no",
                             group %in%c(select.mod.name)
      ) %>% select(ID,group,degree) 
    }
    id.s = mod1$group %>% unique()
    for (i in seq_along(id.s)) {
      id.t =  mod1 %>% 
        dplyr::filter(group %in% id.s[i]) %>%
        .$ID
      ps.t = ps %>% 
        scale_micro() %>%
        subset_taxa.wt("OTU", id.t )
      otu = ps.t %>% 
        vegan_otu() %>%
        t()
      colSD = function(x){
        apply(x,2, sd)
      }
      # NOTE(review): `otu - colMeans(otu)` relies on column-wise vector
      # recycling, which is only a true per-column centering when the
      # lengths line up; sweep(otu, 2, colMeans(otu)) is the explicit
      # form — confirm the intended behavior before changing.
      dat = (otu - colMeans(otu))/colSD(otu) 
      head(dat)
      otu_table(ps.t) = otu_table(as.matrix(dat),taxa_are_rows = TRUE)
      #-- Total (z-scored) abundance of this module per sample
      otu = ps.t %>%  vegan_otu() %>% t()
      colSums(otu)
      dat = data.frame(id = names(colSums(otu)),abundance.zscore = colSums(otu))
      colnames(dat)[2] = id.s[i]
      if (i ==1) {
        tem = dat
      } else{
        dat$id = NULL
        tem = cbind(tem,dat)
      }
    }
    head(tem)
    map =sample_data(ps.t)
    map$id = row.names(map)
    map = map[,c("id","Group")]
    data = map %>%
      as_tibble() %>%
      inner_join(tem,by = "id") %>%
      dplyr::rename(group = Group)
    
    # First env column is renamed to "id" so joins on sample ID work
    colnames(env)[1] = "id"
    subenv = env %>% dplyr::select(id,everything()) %>% select(id,select.env )
    head(data)
    tab = data %>% left_join(subenv,by = "id")
    head(tab)
    library(reshape2)
    mtcars2 = melt(tab, id.vars=c(select.env,"group","id"))
    head(mtcars2)
    lab = mean(mtcars2[,select.env])
    # Scatter + regression of each module's z-score vs. the env variable
    p1_1 = ggplot2::ggplot(mtcars2,aes(x= value,!!sym(select.env), colour=variable)) +
      ggplot2::geom_point() +
      ggpubr::stat_cor(label.y=lab*1.1)+
      ggpubr::stat_regline_equation(label.y=lab*1.1,vjust = 2) +
      facet_wrap(~variable, scales="free_x") +
      geom_smooth(aes(value,!!sym(select.env), colour=variable), method=lm, se=TRUE)+
      theme_classic()
    p1_1
    filename = paste(corpath,"/cor_netowrkmodel",select.env,".pdf",sep = "")
    ggsave(filename,p1_1,width = 16,height = 18)
    
    #-- Per-sample network properties vs. the env variable #---------
    igraph = make_igraph(cor)
    dat = igraph::V(igraph)
    names(dat) %>% length()
    #-- Determine which network OTUs occur in each sample
    print("1")
    otu = pst %>% vegan_otu() %>% t()
    otu = otu[row.names(otu) %in% names(dat),]
    otu[otu > 1] = 1
    dim(otu)
    A = list()
    dat.f = NULL
    
    for (i in seq_len(ncol(otu))) {
      tem = otu[,colnames(otu)[i]][otu[,colnames(otu)[i]] > 0 ] %>% names()
      A[[colnames(otu)[i]]] = tem
      #-- Properties of the sample-specific induced sub-network
      tem.2 = A[[colnames(otu)[i]]]
      tem.g = igraph::induced_subgraph(igraph,tem.2)
      dat = net_properties.2(tem.g,n.hub = FALSE)
      head(dat,n = 16)
      
      # NOTE(review): row 16 of the property table is forced to 0
      # (hub counting is disabled above) — confirm which property this is
      dat[16,1] = 0
      dat = as.data.frame(dat)
      dat$value = as.numeric(dat$value)
      colnames(dat) = colnames(otu)[i]
      if (i == 1) {
        dat.f = dat
      } else {
        dat.f = cbind(dat.f,dat)
      }
    }
    head(dat.f)
   
    dat.f = dat.f %>% 
      t() %>% 
      as.data.frame()
    head(dat.f)
    dat.f$id = row.names(dat.f)
    dat.f = dat.f %>% dplyr:: select(id,everything())
    tab = dat.f %>% left_join(subenv,by = "id")
    head(tab)
    mtcars2 = melt(tab, id.vars=c(select.env,"id"))
    lab = mean(mtcars2[,select.env])
    head(mtcars2)
    # Scatter + regression of each network property vs. the env variable
    p0_1 = ggplot2::ggplot(mtcars2,aes(x= value,!!sym(select.env), colour=variable)) +
      ggplot2::geom_point() +
      ggpubr::stat_cor(label.y=lab*1.1)+
      ggpubr::stat_regline_equation(label.y=lab*1.1,vjust = 2) +
      facet_wrap(~variable, scales="free_x") +
      geom_smooth(aes(value,!!sym(select.env), colour=variable), method=lm, se=TRUE)+
      theme_classic()
    p0_1
    filename = paste(corpath,"/cor_netowrkpropertities",select.env,".pdf",sep = "")
    ggsave(filename,p0_1,width = 16,height = 18)
  }}

res1path <- "results/Other_microbiome_analysis/"

library(tidyverse)
library(igraph)
# Output folder for the environment-difference correlation plots
corpath = paste(res1path,"./env_difference_plot/",sep = "")
dir.create(corpath)

# 16S phyloseq object
ps = readRDS("./data/dataNEW/ps_16s.rds")

# Correlate network modules / network properties with one environmental
# variable (pH) for the selected modules.
# NOTE(review): `env` must already exist in the workspace (it is read from
# ./data/dataNEW/env.csv later in this document) — confirm run order.
net.property_env(
  ps = ps,
  corpath = corpath,
  Top = 500,
  r.threshold= 0.8,
  p.threshold=0.05,
  env = env,
  select.mod = c("model_1","model_2","model_3"),
  select.env = "pH")
[1] "1"
[1] "1"
[1] "1"

Co-occurrence network of bacterial, fungal and environmental factors (细菌-真菌-环境因子三者共现网络)

res1path <- "results/Other_microbiome_analysis/"

# Output folder for the bacteria-fungi-environment network
Envnetplot<- paste(res1path,"./16s_ITS_Env_network",sep = "")
dir.create(Envnetplot)

# Relative-abundance-scaled bacterial (16S) and fungal (ITS) phyloseq objects
ps16s = readRDS("./data/dataNEW//ps_16s.rds") %>% ggClusterNet::scale_micro()
psITS = readRDS("./data/dataNEW//ps_ITS.rds")%>% ggClusterNet::scale_micro()

library(phyloseq)
# The sample metadata (map) of the bacterial and fungal ps objects must match.
# Merge the top 200 16S and top 200 ITS taxa into one phyloseq object.
ps.merge <- ggClusterNet::merge16S_ITS(ps16s = ps16s,
                                       psITS = psITS,
                                       NITS = 200,
                                       N16s = 200)

ps.merge
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 400 taxa and 18 samples ]
sample_data() Sample Data:       [ 18 samples by 2 sample variables ]
tax_table()   Taxonomy Table:    [ 400 taxa by 8 taxonomic ranks ]
map =  phyloseq::sample_data(ps.merge)
# head(map)
# Pool all samples into a single group so one overall network is built
map$Group = "one"
phyloseq::sample_data(ps.merge) <- map

data1 = env

# Hellinger-standardize the environmental table; first column of `env`
# is the sample ID, so only columns 2+ of `data1` are overwritten.
# NOTE(review): `envRDA` is created later in this document from `env` —
# confirm chunks are run in the required order.
envRDA.s = vegan::decostand(envRDA,"hellinger")
data1[,-1] = envRDA.s

# Grouping table marking every environmental variable as an "env" node
Gru = data.frame(ID = colnames(env)[-1],group = "env" )
head(Gru)
     ID group
1    pH   env
2   SOC   env
3    TN   env
4 NH4.N   env
5 NO3.N   env
6    AP   env
library(sna)
library(ggClusterNet)
library(igraph)

# Bipartite co-occurrence network of bacteria, fungi and environmental factors
result <- ggClusterNet::corBionetwork(ps = ps.merge,
                                      N = 0,
                                      r.threshold = 0.6, # correlation threshold
                                      p.threshold = 0.05,
                                      big = TRUE,
                                      group = "Group",
                                      env = data1, # environmental variable table
                                      envGroup = Gru,# env-factor grouping table
                                      layout_net = "model_maptree2",
                                      path = Envnetplot,# output directory
                                      fill = "Phylum", # node fill colour
                                      size = "igraph.degree", # node size variable
                                      scale = TRUE, # standardize to relative abundance?
                                      bio = TRUE, # build a bipartite network?
                                      zipi = FALSE, # compute Zi-Pi?
                                      step = 100, # number of random-network samplings
                                      width = 18,
                                      label = TRUE,
                                      height = 10
)
[1] "one"
 num [1:34, 1:18] 0.0474 0.0633 0.0188 0.0588 0.069 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:34] "pH" "SOC" "TN" "NH4.N" ...
  ..$ : chr [1:18] "sample1" "sample10" "sample11" "sample12" ...
[1] "1"
[1] "2"
[1] "3"
p = result[[1]]
p

# Comparison of network parameters across all samples
data = result[[2]]
plotname1 = paste(Envnetplot,"/network_all.jpg",sep = "")
ggsave(plotname1, p,width = 15,height = 12,dpi = 72)
plotname1 = paste(Envnetplot,"/network_all.png",sep = "")
ggsave(plotname1, p,width = 10,height = 8,dpi = 72)
plotname1 = paste(Envnetplot,"/network_all.pdf",sep = "")
ggsave(plotname1, p,width = 15,height = 12)
# NOTE(review): "Grobel" is likely a typo for "Global"; kept as-is because
# other steps may read this exact filename.
tablename <- paste(Envnetplot,"/co-occurrence_Grobel_net",".csv",sep = "")
write.csv(data,tablename)

Bacterial-fungal cross-domain network analysis (OTU network, domain network, bipartite network) (细菌和真菌跨域网络分析-OTU网络-域网络-二分网络)

# Only correlations between bacteria and fungi are considered,
# not correlations within bacteria or within fungi.
res1path <- "results/Other_microbiome_analysis/"

Envnetplot<- paste(res1path,"./16S_ITS_network",sep = "")
dir.create(Envnetplot)

ps16s = readRDS("./data/dataNEW/ps_16s.rds")
psITS = readRDS("./data/dataNEW/ps_ITS.rds")

# The sample metadata (map) of the bacterial and fungal ps objects must match.
ps.merge <- ggClusterNet::merge16S_ITS(ps16s = ps16s,
                                       psITS = psITS,
                                       N16s = 300,
                                       NITS = 300
                                       )
ps.merge
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 600 taxa and 18 samples ]
sample_data() Sample Data:       [ 18 samples by 2 sample variables ]
tax_table()   Taxonomy Table:    [ 600 taxa by 8 taxonomic ranks ]
map =  phyloseq::sample_data(ps.merge)

# head(map)
# Pool all samples into a single group
map$Group = "one"
phyloseq::sample_data(ps.merge) <- map

# No custom node labels
data = NULL

library(sna)
library(igraph)
library(ggClusterNet)
library(phyloseq)
# Cross-domain (bacteria-fungi) bipartite co-occurrence network
result <- corBionetwork(ps = ps.merge,
                        N = 0,
                        lab = data,
                        r.threshold = 0.6,
                        p.threshold = 0.05,
                        group = "Group",
                        # env = data1, 
                        # envGroup = Gru,
                        layout_net = "model_maptree2",
                        path = Envnetplot,
                        fill = "Phylum", 
                        size = "igraph.degree", 
                        scale = TRUE,
                        bio = TRUE, 
                        zipi = F, 
                        step = 100, 
                        width = 12,
                        label = TRUE,
                        height = 10,
                        big = TRUE,
                        select_layout = TRUE,
                        # layout_net = "model_maptree",
                        clu_method = "cluster_fast_greedy"
)
[1] "one"
[1] "1"
[1] "2"
[1] "3"
# Derive the module assignment (and degree) of each node from the
# correlation matrix returned as result[[5]]
tem <- model_maptree(cor =result[[5]],
                     method = "cluster_fast_greedy",
                     seed = 12)
node_model = tem[[2]]
head(node_model)
           ID group degree
1 fun_ASV_311    14     72
2 fun_ASV_203    14     71
3 fun_ASV_246    14     70
4  fun_ASV_68    14     63
5 fun_ASV_268    14     62
6  fun_ASV_76    14     62
p = result[[1]]
p

# Comparison of network parameters across all samples
data = result[[2]]

plotname1 = paste(Envnetplot,"/network_all.pdf",sep = "")
ggsave(plotname1, p,width = 10,height = 8)
tablename <- paste(Envnetplot,"/co-occurrence_Grobel_net",".csv",sep = "")
write.csv(data,tablename)
tablename <- paste(Envnetplot,"/node_model_imformation",".csv",sep = "")
write.csv(node_model,tablename)

# Node table, edge table and correlation matrix of the plotted network
tablename <- paste(Envnetplot,"/nodeG_plot",".csv",sep = "")
write.csv(result[[4]],tablename)
tablename <- paste(Envnetplot,"/edge_plot",".csv",sep = "")
write.csv(result[[3]],tablename)
tablename <- paste(Envnetplot,"/cor_matrix",".csv",sep = "")
write.csv(result[[5]],tablename)

Bipartite network of bacteria and fungi at any taxonomic level (细菌真菌的任意水平二分网络)

library(tidyverse)
res1path <- "results/Other_microbiome_analysis/"

# res1path = "result_and_plot/Micro_and_other_index_16s/"
Envnetplot<- paste(res1path,"/16S_ITS_network_Genus",sep = "")
dir.create(Envnetplot)

ps16s = readRDS("./data/dataNEW/ps_16s.rds")
psITS = readRDS("./data/dataNEW/ps_ITS.rds")

# The sample metadata (map) of the bacterial and fungal ps objects must match.
ps.merge <- merge16S_ITS(ps16s = ps16s,
                         psITS = psITS,
                         N16s = 300,
                         NITS = 300
)
ps.merge
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 600 taxa and 18 samples ]
sample_data() Sample Data:       [ 18 samples by 2 sample variables ]
tax_table()   Taxonomy Table:    [ 600 taxa by 8 taxonomic ranks ]
map =  phyloseq::sample_data(ps.merge)
# head(map)
# Pool all samples into a single group
map$Group = "one"
phyloseq::sample_data(ps.merge) <- map
# Aggregate the merged bacteria+fungi object at genus level
tem.0 = ps.merge %>% tax_glom_wt(ranks = "Genus")
tax = tem.0 %>% vegan_tax() %>%
  as.data.frame()
head(tax)
               filed   Kingdom         Phylum               Class
Acidocella       bac  Bacteria Proteobacteria Alphaproteobacteria
Acrobeloides     fun   Metazoa       Nematoda         Chromadorea
Anteholosticha   fun Alveolata     Ciliophora        Spirotrichea
Aquicella        bac  Bacteria Proteobacteria Gammaproteobacteria
Arcopilus        fun     Fungi     Ascomycota     Sordariomycetes
Arenimonas       bac  Bacteria Proteobacteria Gammaproteobacteria
                          Order           Family          Genus
Acidocella     Rhodospirillales Acetobacteraceae     Acidocella
Acrobeloides         Rhabditida     Cephalobidae   Acrobeloides
Anteholosticha       Urostylida    Holostichidae Anteholosticha
Aquicella         Legionellales     Coxiellaceae      Aquicella
Arcopilus           Sordariales    Chaetomiaceae      Arcopilus
Arenimonas      Xanthomonadales Xanthomonadaceae     Arenimonas
# No custom node labels
data = NULL

library(sna)
library(igraph)
library(ggClusterNet)
library(phyloseq)
library(WGCNA)
# Genus-level bacteria-fungi bipartite co-occurrence network
result <- corBionetwork(ps = tem.0,
                        N = 0,
                        lab = data,
                        r.threshold = 0.6, 
                        p.threshold = 0.05,
                        group = "Group",
                        # env = data1, 
                        # envGroup = Gru,
                        # layout = "fruchtermanreingold",
                        path = Envnetplot,
                        fill = "Phylum", 
                        size = "igraph.degree", 
                        scale = TRUE, 
                        bio = TRUE, 
                        zipi = F, 
                        step = 100, 
                        width = 12,
                        label = TRUE,
                        height = 10,
                        big = TRUE,
                        select_layout = TRUE,
                        layout_net = "model_maptree2",
                        clu_method = "cluster_fast_greedy"
                        )
[1] "one"
[1] "1"
[1] "2"
[1] "3"
# Module assignment (and degree) of each genus-level node from the
# correlation matrix returned as result[[5]]
tem <- model_maptree(cor =result[[5]],
                     method = "cluster_fast_greedy",
                     seed = 12)
node_model = tem[[2]]
head(node_model)
           ID group degree
1  Microascus     8     15
2 Hydrogonium     4      8
3   Conlarium     3      7
4 Mortierella     8      6
5   Arcopilus     6      5
6    Humicola     6      5
otu = tem.0 %>% vegan_otu() %>%
  as.data.frame()
# Align module assignments with the OTU-table column order
node_model = node_model[match(colnames(otu),node_model$ID),]

# WGCNA module eigengenes: one representative abundance profile per
# network module (module labels come from node_model$group)
MEList = moduleEigengenes(otu, colors = node_model$group)
MEs = MEList$eigengenes

nGenes = ncol(otu)
nSamples = nrow(otu)
# Pearson correlation between module eigengenes and environmental variables
moduleTraitCor = cor(MEs, envRDA, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

#sizeGrWindow(10,6)
pdf(file=paste(Envnetplot,"/","Module-env_relationships.pdf",sep = ""),width=10,height=6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue, 1), ")", sep = "")

dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(envRDA),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))
dev.off()
png 
  2 
p = result[[1]]
p

# Comparison of network parameters across all samples
data = result[[2]]

plotname1 = paste(Envnetplot,"/network_all.pdf",sep = "")
ggsave(plotname1, p,width = 10,height = 8)
tablename <- paste(Envnetplot,"/co-occurrence_Grobel_net",".csv",sep = "")
write.csv(data,tablename)

# Node table, edge table and correlation matrix of the plotted network
tablename <- paste(Envnetplot,"/nodeG_plot",".csv",sep = "")
write.csv(result[[4]],tablename)
tablename <- paste(Envnetplot,"/edge_plot",".csv",sep = "")
write.csv(result[[3]],tablename)
tablename <- paste(Envnetplot,"/cor_matrix",".csv",sep = "")
write.csv(result[[5]],tablename)

Neutral model(中性模型)

library(picante)
library(ape)
library(vegan)
library(FSA)
library(eulerr)
library(grid)
library(gridExtra)
require(minpack.lm)
require(Hmisc)
require(stats4)
library(parallel)
library(ggClusterNet)
library(phyloseq)

res1path = "results/Other_microbiome_analysis/"
# Output folder for neutral-model / phylogenetic-signal results
phypath = paste(res1path,"./Phylogenetic_analyse_spacies/",sep = "")
dir.create(phypath)

# Fit Sloan's neutral community model (NCM) separately for each group in the
# sample metadata, and plot observed occupancy against log10 metacommunity
# abundance together with the fitted curve and its 95% CI.
# Returns list(combined plot, list of per-group plots,
#              list of observed tables, list of predicted tables).
neutralModel = function(otu = NULL,
                        tax = NULL,
                        map = NULL,
                        tree = NULL,
                        ps = NULL,
                        group  = "Group",
                        ncol = 3,
                        nrow  = 1
                        ){
  # Rarefy; by default to the minimum library size
  ps = inputMicro(otu,tax,map,tree,ps,group  = group)
  ps
  set.seed(72)  # fix the random seed so rarefaction is reproducible
  psrare = rarefy_even_depth(ps)
  # Convert counts to relative abundance
  ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))
  #--- Start the neutral-model calculation, one group at a time ---
  map = as.data.frame(sample_data(psrare))
  aa = levels(map$Group)
  aa
  map$ID = row.names(map)
  plots = list()
  dat1 = list()
  dat2 = list()
  i =1
  for (i in 1:length(aa)) {
    # Subset the metadata (and thus the samples) to the current group
    maps<- dplyr::filter(as.tibble(map),Group %in%aa[i])
    maps = as.data.frame(maps)
    row.names(maps) = maps$ID
    ps_sub = psrare
    sample_data(ps_sub) =maps ;ps_sub
    # Extract the OTU table (samples in rows after transpose)
    OTU.table = t(otu_table(ps_sub))
    head(OTU.table )
    # Treat the whole community as the metacommunity: number of
    # individuals N = average read depth across samples
    N <- mean(apply(OTU.table, 1, sum))
    # Average read count of each OTU across communities
    p.m <- apply(OTU.table, 2, mean)
    # Drop OTUs never observed in this group
    p.m <- p.m[p.m != 0]
    p <- p.m/N
    p.df = data.frame(p) %>%
      rownames_to_column(var="OTU")
    # Occurrence frequency (occupancy) of each OTU
    OTU.table.bi <- 1*(OTU.table>0)
    freq.table <- apply(OTU.table.bi, 2, mean)
    freq.table <- freq.table[freq.table != 0]
    freq.df = data.frame(OTU=names(freq.table), freq=freq.table)
    # Combine abundance and occupancy per OTU
    C <- inner_join(p.df,freq.df, by="OTU") %>%
      arrange(p)
    # Remove rows with any zero (absent in either source pool or local
    # communities); already done above, repeated as a safeguard
    C.no0 <- C %>%
      filter(freq != 0, p != 0)
    # Limit of detection: one read out of N
    d <- 1/N
    # Fit migration parameter m (Nm) by non-linear least squares (NLS)
    p.list <- C.no0$p
    freq.list <- C.no0$freq
    m.fit <- nlsLM(freq.list ~ pbeta(d, N*m*p.list, N*m*(1-p.list), lower.tail=FALSE), start=list(m=0.1))
    m.ci <- confint(m.fit, 'm', level=0.95)
    m.sum <- summary(m.fit)
    m.coef = coef(m.fit)
    # Predicted occupancy under the fitted model and goodness of fit
    freq.pred <- pbeta(d, N*coef(m.fit)*p.list, N*coef(m.fit)*(1-p.list), lower.tail=FALSE)
    Rsqr <- 1 - (sum((freq.list - freq.pred)^2))/(sum((freq.list - mean(freq.list))^2))
    # Table of model-fit statistics
    fitstats <- data.frame(m=m.coef, m.low.ci=m.ci[1], m.up.ci=m.ci[2],
                           Rsqr=Rsqr, p.value=m.sum$parameters[4], N=N,
                           Samples=nrow(OTU.table), Richness=length(p.list),
                           Detect=d)
    # Wilson confidence interval for the predicted occupancies
    freq.pred.ci <- binconf(freq.pred*nrow(OTU.table), nrow(OTU.table), alpha=0.05, method="wilson", return.df=TRUE)
    # Table of predictions
    pred.df <- data.frame(metacomm_RA=p.list, frequency=freq.pred,
                          frequency_lowerCI=freq.pred.ci[,2],
                          frequency_upperCI=freq.pred.ci[,3]) %>%
      unique()
    # Table of observed occupancy and abundance
    obs.df = C.no0 %>%
      dplyr::rename(metacomm_RA = p, frequency=freq)
    head(obs.df)
    # Points = observed; solid line = fitted model; dashed lines = 95% CI
    p = ggplot(data=obs.df) +
      geom_point(data=obs.df, aes(x=log10(metacomm_RA), y=frequency),
                 alpha=.3, size=2, color="#8DD3C7") +
      geom_line(data=pred.df, aes(x=log10(metacomm_RA), y=frequency), color="#FFFFB3") +
      geom_line(data=pred.df, aes(x=log10(metacomm_RA), y=frequency_lowerCI), linetype=2, color="#FFFFB3") +
      geom_line(data=pred.df, aes(x=log10(metacomm_RA), y=frequency_upperCI), linetype=2, color="#FFFFB3") +
      # geom_text(data=fitstats, aes(label = paste("R^2 == ", round(Rsqr, 3))),
      #           x=1, y=0.75, size=4, parse=TRUE) +
      # geom_text(data=fitstats, aes(label = paste("italic(m) ==", round(m, 3))),
      #           x=-1, y=0.85, size=4, parse=TRUE) +
      labs(x="Log10 abundance in\nmetacommunity", y="Frequency detected",title = paste(aa[i],paste("R^2 == ", round(fitstats$Rsqr, 3)),paste("italic(m) ==", round(fitstats$m, 3)))) +
      theme_bw() +
      theme(axis.line = element_line(color="black"),
            legend.position = "none",
            axis.title = element_text(size=14),
            axis.text = element_text(size=12))
    p
    plots[[aa[i]]] = p
    dat1[[aa[i]]] = obs.df
    dat2[[aa[i]]] = pred.df
  }
  # plots$ABCD
  # library(ggpubr)
  # nrow=2,,ncol=4
  # Arrange the per-group panels into one figure
  p  = ggpubr::ggarrange(plotlist = plots,common.legend = TRUE, legend="right",ncol = ncol,nrow = nrow)
  p
  return(list(p,plots,dat1,dat2))
}

ps = readRDS("./data/dataNEW/ps_16s.rds")

# Fit the Sloan neutral model separately for each group (3 panels side by side)
result = neutralModel(ps = ps,group  = "Group",ncol = 3)

# Merge charts
p1 =  result[[1]]
p1

FileName <- paste(phypath,"./1_neutral_modelCul", ".pdf", sep = "")
ggsave(FileName, p1,width = 12,height = 4)
FileName <- paste(phypath,"./1_neutral_modelCul", ".png", sep = "")
ggsave(FileName, p1,width = 12,height = 4)


#-- Phylogenetic signal of environmental niche preferences.
# For each group: compute every OTU's niche preference for each environmental
# variable (abundance-weighted mean via vegan::wascores), then a Mantel
# correlogram between niche-preference distance and phylogenetic distance.
# Each correlogram is saved as an RDS file under `path` (slow, so cached).
# Called for its side effect (the RDS files); returns nothing useful.
phyloSignal = function(otu = NULL,
                       tax = NULL,
                       map = NULL,
                       tree = NULL ,
                       ps = NULL,
                       env = env,
                       group  = "Group",
                       path = "./"){
  # Rarefy to even depth (minimum library size by default), fixed seed
  ps = inputMicro(otu,tax,map,tree,ps,group  = group)
  set.seed(72)
  psrare = rarefy_even_depth(ps)
  # Convert counts to relative abundance
  ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))
  map = as.data.frame(sample_data(psrare))
  # Attach the environmental variables to the sample metadata
  mapE =merge(map,env,by = "row.names",all= TRUE)
  row.names(mapE) = mapE$Row.names
  mapE$Row.names = NULL
  mapE$ID = row.names(mapE)
  sample_data(ps.norm) = mapE
  dir.create(path)
  # Mantel correlograms per group; results are cached as RDS because the
  # computation is very slow.
  for (eco in as.character(unique(mapE$Group))){
    print(paste("Now running", eco))
    # Subset the samples to the current group
    sub.physeq = ps.norm
    map = as.data.frame(sample_data(ps.norm))
    mapsub <- map[map$Group == eco,]
    sample_data(sub.physeq) = mapsub
    # Remove OTUs not found in at least 3 samples of this group
    OTU.table = otu_table(sub.physeq)
    OTU.table[OTU.table > 0] = 1
    OTU.freq = rowSums(OTU.table)
    OTU.freq = OTU.freq[OTU.freq > 2]
    sub.physeq = prune_taxa(names(OTU.freq), sub.physeq)
    # Pairwise phylogenetic distances; keep lower triangle only
    tree = phy_tree(sub.physeq)
    phylo.dist = cophenetic(tree)
    sample_OTUs = tree$tip.label
    sam.phylo.dist = phylo.dist[sample_OTUs, sample_OTUs]
    sam.phylo.dist[upper.tri(sam.phylo.dist, diag=TRUE)] = NA
    # Environmental matrix restricted to this group's samples
    site.chem.mat =  env[row.names(env) %in% row.names(mapsub),]
    site.chem.mat = as.matrix(site.chem.mat)
    otu.table = t(otu_table(sub.physeq))
    # Niche preference = abundance-weighted average of each env variable
    OTU.niche = wascores(site.chem.mat, otu.table)
    OTU.niche.df = data.frame(OTU.niche)
    for (i in seq_len(ncol(OTU.niche.df))) {
      # Pairwise distance between OTU niche preferences for this variable
      niche.pref = OTU.niche.df[[i]]
      names(niche.pref) = rownames(OTU.niche.df)
      niche.dist = as.matrix(dist(niche.pref), labels=TRUE)
      sam.niche.dist = niche.dist[sample_OTUs, sample_OTUs]
      sam.niche.dist[upper.tri(sam.niche.dist, diag=TRUE)] = NA
      sam.crlg = mantel.correlog(sam.niche.dist, sam.phylo.dist)
      # Filename scheme must stay in sync with phySigPlot()'s reader
      filename = paste(path,eco,colnames(OTU.niche.df[i]), "_crlg.rds", sep="_")
      saveRDS(sam.crlg, file=filename)
    }
  }
}

# Plot the Mantel correlograms produced by phyloSignal().
# Reads the cached "<path>_<group>_<envvar>__crlg.rds" files, stacks them
# into one table, flags significance (corrected p <= 0.05) and draws Mantel
# correlation against phylogenetic distance class, faceted by group.
# Returns list(ggplot, filtered table with significance flag, raw table).
phySigPlot = function(otu = NULL,
                      tax = NULL,
                      map = NULL,
                      tree = NULL,
                      ps = NULL,
                      group  = "Group",
                      env = env,
                      path = "./"){
  ps = inputMicro(otu,tax,map,tree,ps,group  = group)
  mapE = as.data.frame(sample_data(ps))
  # Collect one correlogram data frame per (group, env variable).
  # The paste() filename scheme must stay identical to phyloSignal()'s writer.
  pieces = list()
  for (eco in levels(mapE$Group)) {
    for (i in seq_along(colnames(env))) {
      crlg.file = paste(path, eco, colnames(env[i]), "_crlg.rds", sep = "_")
      crlg = data.frame(readRDS(file = crlg.file)$mantel.res) %>%
        mutate(Group = eco, property = colnames(env)[i])
      pieces[[length(pieces) + 1]] = crlg
    }
  }
  data2 = do.call(rbind, pieces)
  dim(data2)
  # Flag significance at corrected p <= 0.05 and drop NA distance classes
  eco.crlg = data2 %>%
    mutate(sig = ifelse(Pr.corrected. <= 0.05, "significant", "non-significant")) %>%
    filter(!(is.na(Pr.corrected.)))
  eco.crlg$Group= factor(eco.crlg$Group)
  # Filled points = significant, open points = non-significant
  p = ggplot(data=eco.crlg, aes(x=class.index, y=Mantel.cor)) +
    geom_point(data=eco.crlg[eco.crlg$sig=="significant",], color = "black", size=2, shape=16) +
    geom_point(data=eco.crlg[eco.crlg$sig=="non-significant",], color = "black",size=2, shape=1) +
    geom_line(data=eco.crlg, aes(color=property)) +
    geom_hline(yintercept = 0, linetype=2) +
    labs(x = "Phylogenetic distance class", y="Mantel correlation", color="property") +
    # facet_grid(~Group)
    facet_wrap(~Group,scales="free_y",ncol  = 4)
  return(list(p,eco.crlg,data2))
}

# Environmental / plant-trait table; first column (ID) is the sample name
env = read.csv("./data/dataNEW/env.csv")
head(env)
       ID   pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR
1 sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863
2 sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976
3 sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340
4 sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801
5 sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665
6 sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145
    LB  SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
    LRD2  LRD3 MaxO TLRN TLRL   BI   MID
1 0.0655 0.058    4  156  296 1.54 0.619
2 0.0700 0.069    5  133  263 1.80 1.780
3 0.0660 0.066    6  177  316 2.67 1.373
4 0.0940 0.086    5  139  335 1.37 0.811
5 0.0680 0.071    7  182  238 1.88 0.700
6 0.0700 0.068    4  198  457 1.73 0.602
# Copy of env used for RDA-style analyses (sample IDs become row names below)
envRDA = env
head(env)
       ID   pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR
1 sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863
2 sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976
3 sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340
4 sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801
5 sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665
6 sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145
    LB  SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
    LRD2  LRD3 MaxO TLRN TLRL   BI   MID
1 0.0655 0.058    4  156  296 1.54 0.619
2 0.0700 0.069    5  133  263 1.80 1.780
3 0.0660 0.066    6  177  316 2.67 1.373
4 0.0940 0.086    5  139  335 1.37 0.811
5 0.0680 0.071    7  182  238 1.88 0.700
6 0.0700 0.068    4  198  457 1.73 0.602
# Move the sample ID into row names so envRDA is purely numeric
row.names(envRDA) = env$ID
envRDA$ID = NULL
head(envRDA)
          pH  SOC   TN NH4.N NO3.N   AP   AK    CN   LA Height    TG   RGR   LB
sample1 4.45 7.96 0.70  6.86  9.45 36.2 74.2 11.37 35.6     63 0.609 0.863 48.4
sample2 4.55 9.12 0.89  5.78 10.71 49.5 68.5 10.24 46.8     67 0.625 0.976 70.1
sample3 4.47 7.58 0.92  6.27 11.85 50.8 68.9  8.24 44.6     48 0.886 1.340 61.1
sample4 4.63 8.66 0.65  5.49 10.22 55.7 58.7 13.28 39.7     53 0.644 0.801 74.7
sample5 4.38 9.59 0.74  6.55  9.84 44.4 60.8 13.04 50.7     55 0.591 0.665 58.2
sample6 4.52 9.88 0.63  5.66  9.73 39.9 70.4 15.74 42.5     55 0.751 1.145 70.4
         SB  RB   R.S  FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2  LRL3  LRD1
sample1 275 315 0.975 91.4 35.3 29.4 26.7   19   41   96 31.7  132 131.9 0.188
sample2 213 380 1.340 93.1 36.1 29.5 27.5   30   47   56 34.5  145  84.2 0.126
sample3 253 351 1.117 89.7 30.0 31.2 28.5   26   41  110 26.4  108 181.3 0.179
sample4 220 309 1.048 96.9 38.5 30.6 27.9   22   33   84 25.3  153 156.6 0.208
sample5 309 395 1.078 89.4 32.6 29.8 26.9   28   51  103 28.6  132  77.3 0.135
sample6 255 356 1.095 92.3 33.6 30.3 28.4   25   42  131 26.8  220 211.1 0.241
          LRD2  LRD3 MaxO TLRN TLRL   BI   MID
sample1 0.0655 0.058    4  156  296 1.54 0.619
sample2 0.0700 0.069    5  133  263 1.80 1.780
sample3 0.0660 0.066    6  177  316 2.67 1.373
sample4 0.0940 0.086    5  139  335 1.37 0.811
sample5 0.0680 0.071    7  182  238 1.88 0.700
sample6 0.0700 0.068    4  198  457 1.73 0.602
# Subdirectory for the cached Mantel-correlogram RDS files
phypath2 = paste(phypath,"/phyloSignal/",sep = "")
# BUG FIX: originally `dir.create(phypath)` re-created the parent folder
# (already created above) and never created `phypath2`, so saveRDS() inside
# phyloSignal() would target a non-existent directory.
dir.create(phypath2)

# Phylogenetic signal for SOC and TN (env columns 2:3) on the top 400 OTUs
phyloSignal(ps = ps %>% filter_OTU_ps(400),
            group  = "Group",
            env = envRDA[,2:3],
            path = phypath2)
[1] "Now running Group1"
[1] "Now running Group2"
[1] "Now running Group3"
result = phySigPlot(ps = ps,group  = "Group",env = envRDA[,2:3],path = phypath2)

# Extract the plot
# NOTE(review): `mytheme1` is not defined in this chunk — presumably created
# earlier in the document; confirm before running in a fresh session.
p2 = result[[1]] + mytheme1
p2

# Extract the plotting data
data = result[[2]]
head(data)
       class.index n.dist Mantel.cor Pr.Mantel. Pr.corrected.  Group property
D.cl.1      0.0681    713    0.01356      0.009         0.009 Group1      SOC
D.cl.2      0.1954   1021    0.00614      0.271         0.271 Group1      SOC
D.cl.3      0.3227   1915    0.01013      0.159         0.318 Group1      SOC
D.cl.4      0.4499   2073    0.00511      0.298         0.542 Group1      SOC
D.cl.5      0.5772   4801    0.01436      0.185         0.636 Group1      SOC
D.cl.6      0.7045  10999    0.04456      0.013         0.065 Group1      SOC
                   sig
D.cl.1     significant
D.cl.2 non-significant
D.cl.3 non-significant
D.cl.4 non-significant
D.cl.5 non-significant
D.cl.6 non-significant
# Persist the phylogenetic-signal figure (PDF) and its plotting data (CSV).
FileName <- paste0(phypath, "2_phySigPlot.pdf")
ggsave(FileName, p2, width = 15, height = 6)
FileName <- paste0(phypath, "2_phySigPlot.csv")
write.csv(data, FileName)

Null model calculation (计算零模型)

# Null-model test of community assembly based on pairwise similarity.
#
# For every level of `group`, the observed pairwise similarity
# (1 - vegdist dissimilarity) among the group's samples is compared with the
# mean similarity of r = 100 randomized communities generated under
# `null.model`. Effect sizes (ES, SES) and the ratio
# 1 - permuted/observed similarity are summarized per group.
#
# Arguments:
#   otu, tax, map, tree, ps: forwarded to inputMicro() to assemble a phyloseq
#     object (supply either the individual tables or a ready-made `ps`).
#   group: sample_data column that defines the groups.
#   dist.method: dissimilarity index passed to vegan::vegdist().
#   gamma.method: "group" restricts the OTU pool (and the occurrence weights
#     used for resampling) to OTUs present in the current group; any other
#     value uses the full OTU table.
#   transfer: accepted for interface compatibility; not used in this body.
#   null.model: "ecosphere" (resampling that preserves each sample's richness,
#     weighted by OTU occurrence), or "ecosim"/"frequency" via
#     picante-style randomizeMatrix().
#
# Returns list(rep, ratio, aov.re):
#   rep    - transposed summary table, one column per group;
#   ratio  - data.frame(ratio, group) over all within-group sample pairs;
#   aov.re - aov(ratio ~ group), or NULL when only one group is present.
nullModel <- function(otu = NULL,
         tax = NULL,
         map = NULL,
         tree = NULL ,
         ps = NULL,
         group  = "Group",
         dist.method =  "bray",
         gamma.method = "total",
         transfer = "none",
         null.model = "ecosphere"){
  ps = inputMicro(otu,tax,map,tree,ps,group  = group)
  map = as.data.frame(sample_data(ps))
  grp1 = unique(map$Group)
  grp=list()
  ### Build a list of sample names per group
  for (i in 1:length(grp1)) {
    grp[[i]]=rownames(map)[which(map$Group==grp1[i])]
  }
  names(grp) = grp1
  # Accumulators grown across the group loop
  report = c()
  dat4anova = c()
  grp4anova = c()
  report.ES = c()
  report.SES = c()
  # x=17
  # OTU table with taxa as rows, samples as columns
  otu = as.data.frame(t(vegan_otu(ps)))
  otu = as.matrix(otu)
  for(x in c(1:length(grp))){  # one iteration per group
    #print(paste("Group",x))
    dataCK1 = otu[,grp[[x]]]
    ## delete empty rows (only when the species pool is group-restricted)
    if(gamma.method == "group"){
      rsum1 = rowSums(dataCK1)
      tempCK1 = which(rsum1==0)
      if(length(tempCK1)!=0) {dataCK1 = dataCK1[-tempCK1,]}
    }
    # Pairwise dissimilarity within the current group's samples
    beta.dist = vegdist(t(dataCK1),method = dist.method)
    # Convert dissimilarity to similarity
    similarity.ob = 1 - beta.dist
    #similarity.ob.sd = sd(1-beta.dist, na.rm=TRUE)
    # gamma = size of the OTU pool
    gamma = nrow(dataCK1)
    # alpha = observed richness of each sample
    alpha = colSums(dataCK1>0)
    # Occurrence weights: per-OTU total abundance over the chosen pool
    if(gamma.method == "group"){
      occur = apply(dataCK1, MARGIN=1, FUN=sum)
    }else{
      occur = apply(otu, MARGIN=1, FUN=sum)  #otu[valid.row,]
    }
    #print(paste(similarity.ob, similarity.ob.sd))
    # Number of null-model randomizations
    r = 100
    # Empty dist accumulator for the permuted similarities
    similarity.pm = matrix(0, nrow=ncol(dataCK1), ncol=ncol(dataCK1))
    similarity.pm = as.dist(similarity.pm)
    # i = 1
    for(i in 1:r){
      # print(i)
      # Empty randomized OTU matrix for this iteration
      PRM1 = matrix(0, ncol= ncol(dataCK1), nrow = nrow(dataCK1))
      if(null.model == "ecosphere"){
        # j = 1
        for(j in 1:ncol(dataCK1)){
          # Extract all OTUs of this sample whose abundance is greater than 0
          aa = dataCK1[dataCK1[,j]>0,j]
          # Reassign those abundances to alpha[j] OTUs drawn without
          # replacement, weighted by pool-wide occurrence
          PRM1[sample(1:gamma, alpha[j], replace=FALSE, prob=occur), j] = aa
        }
      }else if(null.model == "ecosim"){
        PRM1 = randomizeMatrix(dataCK1, null.model="independentswap")
      }else if(null.model == "frequency"){
        PRM1 = randomizeMatrix(dataCK1, null.model="frequency")
      }
      # Distance between the randomized samples
      dist_pm = vegdist(t(PRM1),method = dist.method)
      # Accumulate the permuted similarities into the dist accumulator
      similarity.pm = similarity.pm + (1- dist_pm)
    }
    # Mean permuted similarity over the r randomizations
    similarity.pm = similarity.pm/r
    #plot(density(similarity.pm[i,]))
    normality = shapiro.test(similarity.pm)# Normality test on permuted similarity
    nor.p = normality$p.value
    # Paired t-test: observed vs. permuted similarity per sample pair
    ttest = t.test(similarity.pm, similarity.ob, alternative="two.sided", paired = TRUE, conf.level = 0.95)
    tt.p = ttest$p.value
    conf.int = ttest$conf.int
    pm.mean = mean(similarity.pm)
    pm.sd = sd(similarity.pm)
    # Effect size as log-ratio of observed to permuted similarity
    ES = log(similarity.ob) - log(similarity.pm)
    effect.size = mean(ES)
    effect.size.sd = sd(ES)
    # Standardized effect size (z-score against the null spread)
    SES = (similarity.ob - similarity.pm)/pm.sd
    sd.effect.size = mean(SES)
    sd.effect.size.sd = sd(SES)
    # Fraction of similarity not explained by the null model
    ratio = 1 - similarity.pm / similarity.ob
    ratio.mean = mean(ratio)
    ratio.sd = sd(ratio)
    dat4anova = c(dat4anova, as.vector(ratio))
    grp4anova = c(grp4anova, rep(names(grp)[x], length(ratio)))
    conf.int.str = paste("[",paste(signif(conf.int,digits=3),collapse="~"),"]",sep="")
    report = rbind(report, c(mean(similarity.ob),sd(similarity.ob), pm.mean,  pm.sd, conf.int.str, nor.p, tt.p , effect.size, effect.size.sd, sd.effect.size, sd.effect.size.sd, ratio.mean, ratio.sd))
    report.ES = c(report.ES, effect.size)
    report.SES = c(report.SES, sd.effect.size)
  }
  rownames(report) = grp1
  colnames(report) = c("Mean of observed similarity", "Standard deviation of observed similarity",
                       "Mean of permutated similarity", "Standard deviation of permutated similarity",
                       "95% Conf int of perm similarity", "Normality test (p) on Perm similarity",
                       "T test on Ob and Perm similarity", "Effect size (ES)", "SD of ES",
                       "Standardized effect size (SES)", "SD of SES", "Mean of Ratio", "SD of Ratio")
  head(report)
  rep = t(report)
  head(rep)
  # This statistic tests whether the ratio differs between groups
  ## ANOVA on ratio ~ group (skipped when there is a single group)
  if (length(unique(grp4anova)) > 1) {
    aov.re = aov(dat4anova ~ grp4anova)
  } else {
    aov.re = NULL
  }
  #--------------- Keep the per-pair ratios for later use
  ratio = data.frame(ratio = dat4anova,group = grp4anova)
  return(list(rep,ratio,aov.re))
}

# Load the 16S phyloseq object and keep only abundant taxa (total count > 1000)
ps = readRDS("./data/dataNEW/ps_16s.rds")

psphy = filter_taxa(ps, function(x) sum(x ) > 1000 , TRUE);psphy
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 597 taxa and 18 samples ]
sample_data() Sample Data:       [ 18 samples by 2 sample variables ]
tax_table()   Taxonomy Table:    [ 597 taxa by 7 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 597 tips and 595 internal nodes ]
refseq()      DNAStringSet:      [ 597 reference sequences ]
# Run the null model per Group using Bray-Curtis similarity
result <- nullModel(ps = psphy,
                    group="Group",
                    dist.method =  "bray",
                    gamma.method = "total",
                    transfer = "none",
                    null.model = "ecosphere"
                    )

#-- Per-group null-model summary table
nullModeltab <- result[[1]]
# Per-pair ratio (1 - permuted/observed similarity)
ratiotab <- result[[2]]
#- ANOVA on the ratios (between-group difference)
aovtab <- result[[3]]
FileName <- paste(phypath,"3_nullModeltab", ".csv", sep = "")
write.csv(nullModeltab,FileName)
FileName <- paste(phypath,"3_ratiotab", ".csv", sep = "")
write.csv(ratiotab,FileName)

βNTI-nearest taxon index(最近种间亲缘关系指数)

# Resolve the attr() name clash: several attached packages mask base::attr,
# which the bNTI helpers below rely on.
conflicts_prefer(base::attr)
# Compute βNTI (beta nearest-taxon index) for all sample pairs.
#
# The phyloseq object is rarefied to even depth (seed fixed at 72 for
# reproducibility), observed βMNTD is computed with comdistnt(), and `num`
# tip-label shuffles of the tree give the null βMNTD distribution; βNTI is
# (observed - mean(null)) / sd(null) per sample pair.
#
# Arguments:
#   otu, tax, map, tree, ps: forwarded to inputMicro() to assemble the
#     phyloseq object.
#   group: grouping column name (forwarded to inputMicro()).
#   num: number of randomizations (999 is typical in the literature; slow).
#   thread: cores passed to mclapply() (values > 1 only work on Unix).
#
# Returns list(bNTI): a data.frame with columns Sample_1, Sample_2, bMNTD,
# mean_bMNTD, sd_bMNTD, bNTI (one row per sample pair).
bNTICul = function(otu = NULL,tax = NULL,map = NULL,tree = NULL ,ps = NULL,group  = "Group",num = 99,thread = 1){
  ps = inputMicro(otu,tax,map,tree,ps,group  = group)
  ps
  ps_sub <- ps
  # tree = phy_tree(ps)
  # tree
  #------------- Adjust the sample-metadata (map) table ----------------------
  # Duplicate the row names into an explicit ID column
  map = as.data.frame(sample_data(ps_sub))
  map$ID = row.names(map)
  sample_data(ps) = map
  #----------- Prepare the OTU table: rarefy to even depth -------------------
  # No target depth is set, so rarefy_even_depth() uses the minimum library size
  set.seed(72)  # setting seed for reproducibility
  psrare = rarefy_even_depth(ps_sub)
  # Check per-sample sequence counts after rarefaction
  sample_sums(psrare)
  # Relative-abundance normalization
  # NOTE(review): ps.norm is not referenced again in this function; βNTI below
  # is computed from psrare.
  ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))
  # βMNTD for one randomized (tip-shuffled) null community
  bMNTD_null_func <- function(i, OTU.table, tree){
    # Shuffling tip labels breaks the phylogeny-abundance association
    tree$tip.label = sample(tree$tip.label)
    bMNTD_s = comdistnt(OTU.table, cophenetic(tree), abundance.weighted = TRUE)
    # Flatten the dist object into a long Sample_1/Sample_2 data.frame
    A <- attr(bMNTD_s, "Size")
    B <- if (is.null(attr(bMNTD_s, "Labels"))) sequence(A) else attr(bMNTD_s, "Labels")
    if (isTRUE(attr(bMNTD_s, "Diag"))) attr(bMNTD_s, "Diag") <- FALSE
    if (isTRUE(attr(bMNTD_s, "Upper"))) attr(bMNTD_s, "Upper") <- FALSE
    bMNTD_s.df = data.frame(Sample_1 = B[unlist(lapply(sequence(A)[-1], function(x) x:A))],
                            Sample_2 = rep(B[-length(B)], (length(B)-1):1),
                            bMNTD = as.vector(bMNTD_s),
                            rep=i)
    return(bMNTD_s.df)
  }
  # Observed βMNTD plus the null distribution, combined into βNTI
  Phylo_turnover <- function(physeq, reps, nproc){
    # Extract OTU table
    OTU.table = t(otu_table(physeq))
    # Extract phylogenetic tree
    tree = phy_tree(physeq)
    # Get βMNTD between all communities
    bMNTD_o = comdistnt(OTU.table, cophenetic(tree), abundance.weighted = TRUE)
    # Flatten the observed dist object the same way as in the null helper
    A <- attr(bMNTD_o, "Size")
    B <- if (is.null(attr(bMNTD_o, "Labels"))) sequence(A) else attr(bMNTD_o, "Labels")
    if (isTRUE(attr(bMNTD_o, "Diag"))) attr(bMNTD_o, "Diag") <- FALSE
    if (isTRUE(attr(bMNTD_o, "Upper"))) attr(bMNTD_o, "Upper") <- FALSE
    bMNTD_o.df = data.frame(Sample_1 = B[unlist(lapply(sequence(A)[-1], function(x) x:A))],
                            Sample_2 = rep(B[-length(B)], (length(B)-1):1),
                            bMNTD = as.vector(bMNTD_o))
    # Get βMNTD for randomized null communities
    rep.list = seq(1, reps)
    bMNTD_s.df.list = mclapply(rep.list, bMNTD_null_func, OTU.table=OTU.table, tree=tree, mc.cores=nproc)
    # Combine all data together and calculate βNTI for each sample pair
    bMNTD_s.df <- do.call("rbind", bMNTD_s.df.list)
    bMNTD_s.means.df = bMNTD_s.df %>%
      group_by(Sample_1, Sample_2) %>%
      dplyr::summarize(mean_bMNTD = mean(bMNTD),
                       sd_bMNTD = sd(bMNTD))
    bMNTD_o.df = inner_join(bMNTD_o.df, bMNTD_s.means.df, by=c("Sample_1", "Sample_2")) %>%
      mutate(bNTI = (bMNTD - mean_bMNTD)/sd_bMNTD)
    return(bMNTD_o.df)
  }
  # On a single core this takes >10 hours on real data with 999 iterations,
  # so multiple cores are recommended.
  # The third argument sets the number of threads (e.g. 30 on Linux);
  # the second sets the number of iterations (literature typically uses 999).
  bNTI = Phylo_turnover(psrare, num, thread)
  return(list(bNTI))
}

# Reload and filter the phyloseq object, then compute βNTI with 100 shuffles
ps = readRDS("./data/dataNEW/ps_16s.rds")
psphy = filter_taxa(ps, function(x) sum(x ) > 1000 , TRUE);psphy
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 597 taxa and 18 samples ]
sample_data() Sample Data:       [ 18 samples by 2 sample variables ]
tax_table()   Taxonomy Table:    [ 597 taxa by 7 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 597 tips and 595 internal nodes ]
refseq()      DNAStringSet:      [ 597 reference sequences ]
result = bNTICul(ps = psphy,group  = "Group",num = 100,thread = 1)
bNTI = result[[1]]
head(bNTI)
  Sample_1 Sample_2    bMNTD mean_bMNTD sd_bMNTD  bNTI
1 sample10  sample1 0.004675   0.003640 0.001402 0.738
2 sample11  sample1 0.007456   0.006143 0.002386 0.550
3 sample12  sample1 0.002150   0.001382 0.000688 1.117
4 sample13  sample1 0.002822   0.001092 0.000873 1.980
5 sample14  sample1 0.001703   0.001007 0.000783 0.889
6 sample15  sample1 0.000923   0.000493 0.000330 1.304
# Save the βNTI table; it is re-read by the downstream combination plot
filename = paste(phypath,"/4_bNTI.csv",sep = "")
write.csv(bNTI, filename)

RCbray calculation(计算RCbray)

# Compute the Raup-Crick metric based on Bray-Curtis (RCbray) for all
# sample pairs, following the simulate-two-communities null approach.
#
# Arguments:
#   otu, tax, map, tree, group: accepted for interface symmetry with the other
#     helpers. NOTE(review): unlike bNTICul(), this function never calls
#     inputMicro(), so these arguments are unused and `ps` must be supplied.
#   ps: phyloseq object (rarefied internally with seed 72).
#   num: number of null simulations per sample pair (999 is typical; slow).
#   thread: cores passed to mclapply() (values > 1 only work on Unix).
#
# Returns list(RCb): a data.frame with columns Site1, Site2, RCb, where RCb
# is scaled to [-1, 1] via (p - 0.5) * 2.
RCbary = function(otu = NULL,tax = NULL,map = NULL,tree = NULL ,ps = NULL,group  = "Group",num = 99,thread = 1){
  ps_sub <- ps
  #---------------- Tidy the sample-metadata (map) table
  map = as.data.frame(sample_data(ps_sub))
  map$ID = row.names(map)
  sample_data(ps) = map
  #------------------- Prepare the OTU table
  #----------------- Rarefy to even depth; no target depth is set, so the
  # minimum library size is used by default
  set.seed(72)  # setting seed for reproducibility
  psrare = rarefy_even_depth(ps_sub )
  # Check per-sample sequence counts after rarefaction
  sample_sums(psrare)
  # Relative-abundance normalization
  # NOTE(review): ps.norm is not referenced again; RCbray runs on psrare.
  ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))
  #-------------- Two helper functions
  # Bray-Curtis distance between one pair of simulated communities
  RCbray_null_func <- function(i, freq.abd.df, alpha1, alpha2, N){
    # Get simulated communities and distance
    ## initally select OTUs weighted by their frequency. The number of OTUs selected should equal the richness of the samples.
    simcom1 = data.frame(table(sample(freq.abd.df$OTU, size=alpha1, replace=FALSE, prob=freq.abd.df$freq)), stringsAsFactors = F)
    colnames(simcom1) = c("OTU","simcom1")
    simcom1$OTU = as.character(simcom1$OTU)
    simcom1 = inner_join(simcom1, freq.abd.df, by="OTU")
    simcom2 = data.frame(table(sample(freq.abd.df$OTU, size=alpha2, replace=FALSE, prob=freq.abd.df$freq)), stringsAsFactors = F)
    colnames(simcom2) = c("OTU","simcom2")
    simcom2$OTU = as.character(simcom2$OTU)
    simcom2 = inner_join(simcom2, freq.abd.df, by="OTU")
    ## Now recruit OTUs based on their abundance in the metacommunity
    simcom1.abd = data.frame(table(sample(simcom1$OTU, size=N-alpha1, replace=TRUE, prob=simcom1$p)), stringsAsFactors = F)
    colnames(simcom1.abd) = c("OTU","simcom1.abd")
    simcom1.abd$OTU = as.character(simcom1.abd$OTU)
    simcom1 = full_join(simcom1, simcom1.abd, by="OTU") %>%
      mutate(simcom1.abd = ifelse(is.na(simcom1.abd), 1, simcom1.abd)) %>%
      select(OTU, simcom1.abd)
    simcom2.abd = data.frame(table(sample(simcom2$OTU, size=N-alpha2, replace=TRUE, prob=simcom2$p)), stringsAsFactors = F)
    colnames(simcom2.abd) = c("OTU","simcom2.abd")
    simcom2.abd$OTU = as.character(simcom2.abd$OTU)
    simcom2 = full_join(simcom2, simcom2.abd, by="OTU") %>%
      mutate(simcom2.abd = ifelse(is.na(simcom2.abd), 1, simcom2.abd)) %>%
      select(OTU, simcom2.abd)
    simcom = full_join(simcom1, simcom2, by="OTU")
    simcom[is.na(simcom)] = 0
    rownames(simcom) = simcom$OTU
    simcom$OTU = NULL
    null.dist = vegdist(t(simcom), method="bray")[1]
    return(null.dist)
  }
  # Main RCbray driver: loops over all sample pairs
  Calc_RCbray <- function(physeq, reps, nproc){
    # Get OTU table from phyloseq object
    otu.table = otu_table(physeq)
    # Get alpha diversity for each sample
    otu.PA.table = otu.table
    otu.PA.table[otu.PA.table > 0] = 1
    alpha.df = data.frame(Sample_ID = colnames(otu.PA.table), OTU.n = colSums(otu.PA.table), stringsAsFactors = F)
    # Get beta diversity matrix
    beta.table = as.matrix(vegdist(t(otu.PA.table), method="bray", diag=TRUE, upper=TRUE))
    ## Get metacommunity
    # Calculate the number of individuals in the meta community (Average read depth)
    N <- mean(apply(t(otu.table), 1, sum))
    # Calculate the average relative abundance of each taxa across communities
    p.m <- apply(t(otu.table), 2, mean)
    p.m <- p.m[p.m != 0]
    p <- p.m/N
    # Calculate the occurrence frequency of each taxa across communities
    otu.table.bi <- 1*(t(otu.table)>0)
    freq <- apply(otu.table.bi, 2, mean)
    freq <- freq[freq != 0]
    # Combine
    freq.abd.df = data.frame(p=p, freq=freq) %>%
      tibble::rownames_to_column(var="OTU") %>%
      filter(p != 0, freq != 0) %>%
      arrange(p)
    # For each pair of samples run the RCbray analysis
    comps = combn(alpha.df$Sample_ID, m=2, simplify = F)
    RCb.df = data.frame(Site1 = character(), Site2 = character(), RCb = numeric(), stringsAsFactors = F)
    for (j in seq(1, length(comps))){
      sam = comps[[j]]
      alpha1 = alpha.df[alpha.df$Sample_ID == sam[1],]$OTU.n
      alpha2 = alpha.df[alpha.df$Sample_ID == sam[2],]$OTU.n
      # Permute "reps" many times
      rep.list = seq(1, reps)
      null.list = mclapply(rep.list, RCbray_null_func, freq.abd.df=freq.abd.df, alpha1=alpha1, alpha2=alpha2, N=N, mc.cores=nproc)
      # NOTE(review): null.list is a list from mclapply(); the comparisons
      # below rely on R comparing a list of length-1 numerics element-wise
      # against a scalar — confirm, or unlist() first for clarity.
      RCb = (length(null.list[null.list > beta.table[sam[1], sam[2]]]) + (0.5*length(null.list[null.list == beta.table[sam[1], sam[2]]])))/reps
      # Rescale the percentile p from [0, 1] to [-1, 1]
      RCb = (RCb - 0.5)*2
      RCb.df = rbind(RCb.df, data.frame(Site1=sam[1], Site2=sam[2], RCb=RCb, stringsAsFactors = F))
    }
    RCb.df
    return(RCb.df)
  }
  # Run the RCbray calculation; with 999 repetitions this takes ~5 hours
  RCb = Calc_RCbray(psrare, num, thread)
  head(RCb)
  return(list(RCb))
}

# Reload and filter the phyloseq object, then compute RCbray (10 reps here
# for speed; 999 is typical for publication-quality runs)
ps = readRDS("./data/dataNEW/ps_16s.rds")
psphy = filter_taxa(ps, function(x) sum(x ) > 1000 , TRUE);psphy
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 597 taxa and 18 samples ]
sample_data() Sample Data:       [ 18 samples by 2 sample variables ]
tax_table()   Taxonomy Table:    [ 597 taxa by 7 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 597 tips and 595 internal nodes ]
refseq()      DNAStringSet:      [ 597 reference sequences ]
result = RCbary(ps = psphy ,group  = "Group",num = 10,thread = 1)
RCbary = result[[1]]
head(RCbary)
    Site1    Site2 RCb
1 sample1 sample10   1
2 sample1 sample11   1
3 sample1 sample12   1
4 sample1 sample13   1
5 sample1 sample14   1
6 sample1 sample15   1
# Save the RCbray table; it is re-read by the downstream combination plot
filename = paste(phypath,"/5_RCb.csv",sep = "")
write.csv(RCbary,filename)

βNTI and RCbray combination analysis(βNTI和RCbray联合出图)

# Combine βNTI and RCbray into assembly-process plots.
#
# Joins the per-pair βNTI and RCbray tables with the sample grouping, keeps
# within-group pairs, classifies each pair into one of five ecological
# processes (|bNTI| >= 2 -> selection; otherwise RCb decides dispersal/drift),
# and draws a βNTI boxplot plus a stacked per-group process-percentage bar.
#
# Arguments:
#   otu, tax, map, tree, ps: forwarded to inputMicro() to assemble the
#     phyloseq object.
#   RCb: data.frame with Sample_1/Sample_2/RCb columns (from RCbary()).
#   bNTI: data.frame with Sample_1/Sample_2/bNTI columns (from bNTICul()).
#   group: grouping column name (forwarded to inputMicro()).
#
# Returns list(eco.bNTI.plot, eco.turnover.plot, eco.plot, turnover.df,
# sum.eco.turnover.df): the two panels, their cowplot combination, the merged
# per-pair table, and the per-group process-percentage table.
bNTIRCPlot = function(otu = NULL,tax = NULL,
                      map = NULL,tree = NULL ,
                      ps = NULL,
                      RCb  = RCb,bNTI = bNTI,group  = "Group"){

  ps = inputMicro(otu,tax,map,tree,ps,group  = group)
  ps
  psrare <- ps
  map = as.data.frame(sample_data(psrare))
  map$ID = row.names(map)
  sample_data(psrare) = map
  # Get habitat metadata and add it to the βNTI then merge with the RCbray dataset
  eco.meta1 = data.frame(sample_data(psrare)) %>%
    select(ID, Group) %>%
    dplyr::rename(Sample_1 = ID, Group_1 = Group)
  eco.meta2=data.frame(sample_data(psrare)) %>%
    select(ID, Group) %>%
    dplyr::rename(Sample_2 = ID, Group_2 = Group)
  # Attach group labels to both members of each bNTI sample pair
  bNTI.df = inner_join(bNTI, eco.meta1) %>%
    inner_join(eco.meta2)
  # Merge the bNTI and RCbray tables
  turnover.df = inner_join(bNTI.df, RCb)
  head(turnover.df)
  dim(turnover.df)
  #-------------- Optional export of the merged table
  # write.csv(turnover.df,"./Result/bNTI//bNTI_RCbray.csv")
  #----- Plot by group
  #------------ bNTI panel: within-group pairs only
  dim(bNTI.df)
  within.bNTI.df = bNTI.df %>%
    filter(Group_1 == Group_2) %>%
    mutate(Group = Group_1)
  head(within.bNTI.df )
  eco.bNTI.plot <- ggplot(within.bNTI.df, aes(x=Group, y=bNTI)) +
    geom_jitter(alpha = 0.1,color ="#984EA3") +
    geom_boxplot(outlier.shape=1,outlier.alpha = 0,fill = "#984EA3") +
    # Dashed lines mark the |bNTI| = 2 selection thresholds
    geom_hline(yintercept = 2, linetype=2, size=0.5) +
    geom_hline(yintercept = -2, linetype=2, size=0.5) +
    labs(x="", y="bNTI") +
    theme_classic() +
    theme(legend.position = "none",
          axis.text = element_text(size=12),
          axis.text.x = element_text(angle=45, hjust=1),
          axis.title = element_text(size=14))
  # Now use RCbray to separate the phylogenetic processes
  eco.turnover.df = turnover.df %>%
    filter(Group_1 == Group_2) %>%
    mutate(Group = Group_1)
  head(eco.turnover.df )
  ## Calculate the relative influence of each process
  # |bNTI| < 2: RCb decides Drift / Dispersal Limited / Homogenizing Dispersal;
  # |bNTI| >= 2: sign of bNTI decides Variable / Homogeneous Selection
  eco.turnover.df = eco.turnover.df %>%
    mutate(process = ifelse(abs(bNTI) < 2,
                            ifelse(abs(RCb) < 0.95, "Drift",
                                   ifelse(RCb >= 0.95, "Dispersal Limited",
                                          ifelse(RCb <= -0.95, "Homogenizing Dispersal", "ERROR"))),
                            ifelse(bNTI >= 2, "Variable Selection",
                                   ifelse(bNTI <= -2, "Homogeneous Selection", "ERROR"))))
  eco.turnover.df$process = factor(eco.turnover.df$process, levels = c("Drift",
                                                                       "Dispersal Limited", "Homogenizing Dispersal",
                                                                       "Variable Selection", "Homogeneous Selection"))

  head(eco.turnover.df)
  #------ Percentage of each of the five processes within every group
  # NOTE(review): the hard-coded /45 below is provisional; perc is recomputed
  # from the actual per-group pair counts (n_sites/num) a few lines later.
  pre = eco.turnover.df %>%
    dplyr::group_by(Group, process) %>%
    dplyr::summarize(n_sites = n(),
                     perc=(n()/45)*100) %>%
    as.data.frame
  # head(numeco  )
   numeco <- pre %>%  dplyr::group_by(Group) %>% 
     dplyr::summarise(num = sum(n_sites))
   alleco <- pre %>% dplyr::left_join(numeco,by = "Group")
   alleco$perc =  alleco$n_sites/ alleco$num * 100
   sum.eco.turnover.df = alleco
  eco.turnover.plot = ggplot(sum.eco.turnover.df, aes(x=Group, y=perc, fill=process)) +
    geom_bar(stat="identity", color="black") +
    # scale_fill_manual(values = c("white", "grey75", "grey50", "black")) +
    labs(x="", y="Percent of pairs (%)", fill="Process") +
    theme_bw() +
    theme(panel.grid = element_blank(),
          axis.text = element_text(size=12),
          axis.text.x = element_text(angle=45, hjust=1),
          axis.title = element_text(size=14),
          legend.key.size = unit(10, "mm"),
          legend.text = element_text(size=12),
          legend.title = element_text(size=14))
  eco.turnover.plot
  # Merge the plots
  eco.plot = cowplot::plot_grid(eco.bNTI.plot, eco.turnover.plot,
                                rel_widths=c(0.6, 1), labels=c("A", "B"))
  eco.plot
  return(list( eco.bNTI.plot, eco.turnover.plot,eco.plot,turnover.df,sum.eco.turnover.df))
}

# Re-read the βNTI table saved earlier
bNTI = read.csv(paste(phypath,"/4_bNTI.csv",sep = ""),row.names = 1)
head(bNTI)
  Sample_1 Sample_2    bMNTD mean_bMNTD sd_bMNTD  bNTI
1 sample10  sample1 0.004675   0.003640 0.001402 0.738
2 sample11  sample1 0.007456   0.006143 0.002386 0.550
3 sample12  sample1 0.002150   0.001382 0.000688 1.117
4 sample13  sample1 0.002822   0.001092 0.000873 1.980
5 sample14  sample1 0.001703   0.001007 0.000783 0.889
6 sample15  sample1 0.000923   0.000493 0.000330 1.304
# Read the RCbray table and add Sample_1/Sample_2 columns (note the swap:
# Site2 -> Sample_1, Site1 -> Sample_2, to match the bNTI pair orientation)
RCb = read.csv(paste(phypath,"/5_RCb.csv",sep = ""),row.names = 1) %>%
  dplyr::mutate(Sample_1 = Site2, Sample_2 = Site1)
head(RCb)
    Site1    Site2 RCb Sample_1 Sample_2
1 sample1 sample10   1 sample10  sample1
2 sample1 sample11   1 sample11  sample1
3 sample1 sample12   1 sample12  sample1
4 sample1 sample13   1 sample13  sample1
5 sample1 sample14   1 sample14  sample1
6 sample1 sample15   1 sample15  sample1
result = bNTIRCPlot(ps = psphy ,RCb  = RCb,bNTI = bNTI,group  = "Group")

#-- βNTI boxplot panel
p3 <- result[[1]]
p3

# RCbray process-percentage panel
p4 <- result[[2]]
p4

# Combined figure: βNTI + RCbray
p5 <- result[[3]]
p5

# Merged per-pair table (βNTI + RCbray + groups)
plotdata = result[[4]]
head(plotdata)
  Sample_1 Sample_2    bMNTD mean_bMNTD sd_bMNTD  bNTI Group_1 Group_2   Site1
1 sample10  sample1 0.004675   0.003640 0.001402 0.738  Group2  Group1 sample1
2 sample11  sample1 0.007456   0.006143 0.002386 0.550  Group2  Group1 sample1
3 sample12  sample1 0.002150   0.001382 0.000688 1.117  Group2  Group1 sample1
4 sample13  sample1 0.002822   0.001092 0.000873 1.980  Group3  Group1 sample1
5 sample14  sample1 0.001703   0.001007 0.000783 0.889  Group3  Group1 sample1
6 sample15  sample1 0.000923   0.000493 0.000330 1.304  Group3  Group1 sample1
     Site2 RCb
1 sample10   1
2 sample11   1
3 sample12   1
4 sample13   1
5 sample14   1
6 sample15   1
# Per-group process-percentage table
dat = result[[5]]
head(dat)
   Group            process n_sites   perc num
1 Group1  Dispersal Limited      12  80.00  15
2 Group1 Variable Selection       3  20.00  15
3 Group2  Dispersal Limited      15 100.00  15
4 Group3  Dispersal Limited      14  93.33  15
5 Group3 Variable Selection       1   6.67  15
# Save the merged table, figures (PDF + PNG), and percentage table
filename = paste(phypath,"/6_bNTI_RCbray.csv",sep = "")
write.csv(plotdata,filename)

FileName <- paste(phypath,"6_bNTI", ".pdf", sep = "")
ggsave(FileName, p3,width =8,height = 6)

FileName <- paste(phypath,"6_RCbary", ".pdf", sep = "")
ggsave(FileName, p4,width = 6,height = 6)

FileName <- paste(phypath,"6_BNTI_RCbray", ".pdf", sep = "")
ggsave(FileName, p5,width = 12,height = 8)

FileName <- paste(phypath,"6_bNTI", ".png", sep = "")
ggsave(FileName, p3,width =8,height = 6)

FileName <- paste(phypath,"6_RCbary", ".png", sep = "")
ggsave(FileName, p4,width = 6,height = 6)

FileName <- paste(phypath,"6_BNTI_RCbray", ".png", sep = "")
ggsave(FileName, p5,width = 12,height = 8)

FileName <- paste(phypath,"6_RCbray.percent.csv", sep = "")
write.csv(dat,FileName, quote = F)

Environmental factors and βNTI correlation analysis(环境因子和βNTI相关分析)

# Correlate each environmental variable with βNTI across sample pairs.
#
# For every column of `env`, the absolute pairwise difference (delta) is
# joined to the βNTI pair table, a Mantel test (delta vs. bNTI) is run, and
# a scatter plot with the |bNTI| = 2 thresholds is drawn; all panels are
# arranged into one figure.
#
# Arguments:
#   otu, tax, map, tree, ps: forwarded to inputMicro() to assemble the
#     phyloseq object.
#   bNTIRC: per-pair table with Sample_1, Sample_2, bNTI, Group_1, Group_2
#     columns (e.g. the saved 6_bNTI_RCbray.csv).
#   env: data.frame of environmental variables, rows named by sample ID.
#   group: grouping column name (forwarded to inputMicro()).
#
# Returns list(p, plot): the arranged figure and the list of per-variable
# panels (named by variable).
EnvCorbNTI = function(otu = NULL,
                      tax = NULL,
                      map = NULL,
                      tree = NULL,
                      ps = NULL,
                      bNTIRC = RCbNTI,
                      env = env,
                      group  = "Group"){

  ps = inputMicro(otu,tax,map,tree,ps,group  = group)
  # Mantel test between the pairwise env-difference matrix (delta) and the
  # pairwise bNTI matrix, both reshaped from the long pair table `df`.
  Sams.mantel.test = function(df, seed=NULL) {
    delta.mat = df %>%
      select(Sample_1, Sample_2, delta) %>%
      spread(Sample_2, delta)
    rownames(delta.mat) = delta.mat$Sample_1
    delta.mat$Sample_1 = NULL
    # Order rows/columns by NA counts so the lower triangle lines up
    delta.mat = delta.mat[names(sort(rowSums(!is.na(delta.mat)), decreasing = FALSE)), names(sort(colSums(!is.na(delta.mat)), decreasing = TRUE))]
    delta.mat = as.dist(delta.mat)
    bNTI.mat = df %>%
      select(Sample_1, Sample_2, bNTI) %>%
      spread(Sample_2, bNTI)
    rownames(bNTI.mat) = bNTI.mat$Sample_1
    bNTI.mat$Sample_1 = NULL
    bNTI.mat = bNTI.mat[names(sort(rowSums(!is.na(bNTI.mat)), decreasing = FALSE)), names(sort(colSums(!is.na(bNTI.mat)), decreasing = TRUE))]
    bNTI.mat = as.dist(bNTI.mat)
    if (!(is.null(seed))){
      set.seed(seed)
    }
    bNTI.mat[is.na(bNTI.mat)] = 0
    mantel.res = vegan::mantel(delta.mat, bNTI.mat)
    return(mantel.res)
  }
  set.seed(72)  # setting seed for reproducibility
  psrare = rarefy_even_depth(ps)
  # Check per-sample sequence counts
  sample_sums(psrare)
  # Relative-abundance normalization (kept for parity with the other helpers)
  ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))
  map = as.data.frame(sample_data(psrare))
  #--------- Merge the sample metadata with the environmental variables
  mapE =merge(map,env,by = "row.names",all= FALSE)
  row.names(mapE) = mapE$Row.names
  mapE$Row.names = NULL
  mapE$ID = row.names(mapE)
  plot = list()
  for (i in colnames(env)) {
    # BUG FIX: the original renamed the target column via
    # gsub(i, "XX", colnames(mapE)), which treats the variable name as a
    # regex and also rewrites any other column whose name merely contains it
    # (e.g. "RB" inside "FRB", "NH4.N" matching via the regex dot). Select
    # the column by exact name with all_of() instead.
    pH.meta1 = mapE %>%
      dplyr::select(ID, env1_1 = dplyr::all_of(i)) %>%
      dplyr::rename(Sample_1 = ID)
    pH.meta2 = mapE %>%
      dplyr::select(ID, env1_2 = dplyr::all_of(i)) %>%
      dplyr::rename(Sample_2 = ID)
    # Pairwise env difference and within/across-group label for each pair
    data = dplyr::inner_join(bNTIRC, pH.meta1) %>%
      dplyr::inner_join(pH.meta2) %>%
      dplyr::mutate(delta = abs(env1_1-env1_2),
             crosstype = ifelse(Group_1 == Group_2, as.character(Group_1), "across"))
    # Run mantel test to see if there is a correlation
    pH.mantel = Sams.mantel.test(data, seed=72)
    # Scatter plot with the |bNTI| = 2 selection thresholds
    p = ggplot(data, aes(x=delta, y=bNTI)) +
      geom_point(pch = 21) +
      geom_hline(yintercept = 2, linetype=2) +
      geom_hline(yintercept = -2, linetype=2) +
      labs(x=paste("",i), y="βNTI",title = paste("r= ", round(pH.mantel$statistic, 3), "p= ", round(pH.mantel$signif, 3))) +
      theme(legend.position = "none") +theme_bw()
    plot[[i]] = p
  }
  # Arrange all per-variable panels into one figure
  p = ggpubr::ggarrange(plotlist = plot, common.legend = TRUE, legend="right")
  return(list(p,plot))
}

# Mantel test between a pairwise environmental-difference matrix (`delta`)
# and a pairwise bNTI matrix, both reshaped from the long pair table `df`.
#
# Arguments:
#   df: data.frame with columns Sample_1, Sample_2, delta, bNTI — one row per
#       sample pair (lower-triangle layout).
#   seed: optional RNG seed passed to set.seed() so the Mantel permutations
#       are reproducible.
#
# Returns the vegan::mantel() result object.
Sams.mantel.test = function(df, seed=NULL) {
  # Run mantel test to see if there is a correlation
  delta.mat = df %>%
    select(Sample_1, Sample_2, delta) %>%
    spread(Sample_2, delta)
  rownames(delta.mat) = delta.mat$Sample_1
  delta.mat$Sample_1 = NULL
  # Order rows/columns by NA counts so the lower triangle lines up, then
  # coerce to a dist object. (Idiom fix: TRUE/FALSE spelled out, not T/F.)
  delta.mat = delta.mat[names(sort(rowSums(!is.na(delta.mat)), decreasing = FALSE)), names(sort(colSums(!is.na(delta.mat)), decreasing = TRUE))]
  delta.mat = as.dist(delta.mat)
  bNTI.mat = df %>%
    select(Sample_1, Sample_2, bNTI) %>%
    spread(Sample_2, bNTI)
  rownames(bNTI.mat) = bNTI.mat$Sample_1
  bNTI.mat$Sample_1 = NULL
  bNTI.mat = bNTI.mat[names(sort(rowSums(!is.na(bNTI.mat)), decreasing = FALSE)), names(sort(colSums(!is.na(bNTI.mat)), decreasing = TRUE))]
  bNTI.mat = as.dist(bNTI.mat)
  if (!(is.null(seed))){
    set.seed(seed)
  }
  # Any pairs missing a bNTI value contribute 0 to the test
  bNTI.mat[is.na(bNTI.mat)] = 0
  mantel.res = vegan::mantel(delta.mat, bNTI.mat)
  return(mantel.res)
}

# df = data
# df = data
# NOTE(review): this second definition silently OVERWRITES the
# Sams.mantel.test defined immediately above, and it expects lower-case pair
# columns `sample1`/`sample2`, while the bNTI tables produced in this script
# use `Sample_1`/`Sample_2`. Confirm which variant downstream code actually
# needs before removing either (EnvCorbNTI() carries its own local copy and
# is unaffected by these globals).
Sams.mantel.test = function(df, seed=NULL) {
  # Run mantel test to see if there is a correlation
  delta.mat = df %>%
    select(sample1, sample2, delta) %>%
    spread(sample2, delta)
  rownames(delta.mat) = delta.mat$sample1
  delta.mat$sample1 = NULL
  # Order rows/columns by NA counts so the lower triangle lines up
  delta.mat = delta.mat[names(sort(rowSums(!is.na(delta.mat)), decreasing = F)), names(sort(colSums(!is.na(delta.mat)), decreasing = T))]
  delta.mat = as.dist(delta.mat)
  bNTI.mat = df %>%
    select(sample1, sample2, bNTI) %>%
    spread(sample2, bNTI)
  rownames(bNTI.mat) = bNTI.mat$sample1
  bNTI.mat$sample1 = NULL
  bNTI.mat = bNTI.mat[names(sort(rowSums(!is.na(bNTI.mat)), decreasing = F)), names(sort(colSums(!is.na(bNTI.mat)), decreasing = T))]
  bNTI.mat = as.dist(bNTI.mat)
  if (!(is.null(seed))){
    set.seed(seed)
  }
  # Any pairs missing a bNTI value contribute 0 to the test
  bNTI.mat[is.na(bNTI.mat)] = 0
  mantel.res = vegan::mantel(delta.mat, bNTI.mat)
  return(mantel.res)
}

#- Load the combined βNTI/RCbray table saved earlier
bNTIRC = read.csv(paste(phypath,"/6_bNTI_RCbray.csv",sep = ""),row.names = 1)
head(bNTIRC)
  Sample_1 Sample_2    bMNTD mean_bMNTD sd_bMNTD  bNTI Group_1 Group_2   Site1
1 sample10  sample1 0.004675   0.003640 0.001402 0.738  Group2  Group1 sample1
2 sample11  sample1 0.007456   0.006143 0.002386 0.550  Group2  Group1 sample1
3 sample12  sample1 0.002150   0.001382 0.000688 1.117  Group2  Group1 sample1
4 sample13  sample1 0.002822   0.001092 0.000873 1.980  Group3  Group1 sample1
5 sample14  sample1 0.001703   0.001007 0.000783 0.889  Group3  Group1 sample1
6 sample15  sample1 0.000923   0.000493 0.000330 1.304  Group3  Group1 sample1
     Site2 RCb
1 sample10   1
2 sample11   1
3 sample12   1
4 sample13   1
5 sample14   1
6 sample15   1
map = sample_data(psphy)
# head(map)
# Correlate every environmental variable in envRDA with βNTI
plot = EnvCorbNTI(ps = psphy,
                  bNTIRC = bNTIRC,
                  group  = "Group",
                  env = envRDA
                  )

## Extract the arranged correlation figure (all variables)
p6 <- plot[[1]]
p6

# Extract a single panel if needed
# plot[[2]][1]

FileName <- paste(phypath,"7_env_corWithBNTI", ".pdf", sep = "")
ggsave(FileName, p6,width = 16,height = 14)

FileName <- paste(phypath,"7_env_corWithBNTI", ".png", sep = "")
ggsave(FileName, p6,width = 16,height = 14)